<!doctype html>
<html lang="en"> 
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <script
      id="p5scripttag"
      src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.min.js"
      integrity="sha512-uaz5GpnQoE6t5echKlX8P52czvsIGgLPcvlzfvRubLZ1Hp8JemUDnbUiAahbVtPb+jUVrNETuXvAhDDF/N3M4w=="
      crossorigin="anonymous"
      referrerpolicy="no-referrer"
    ></script>

    <link
      rel="stylesheet"
      href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/atom-one-dark.min.css"
    />
    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/python.min.js"></script>

    <script>
      const bgCol = "#F2F0E7";
const accentCol = "#fd4578";
// NOTE(review): the two hex colours above are not used in the visible part of
// this file — presumably the page background and accent colours; confirm.

// Highlight the page's code blocks once the DOM is ready.
// `hljs.initHighlightingOnLoad()` is deprecated since highlight.js 10.6; in the
// 11.x build loaded by this page it only forwards to `highlightAll()` and logs
// a deprecation warning, so call `highlightAll()` directly.
hljs.highlightAll();

/**
 * Computes the size the canvas should be given for the current window.
 *
 * Reads the `windowWidth` and `windowHeight` globals at call time.
 *
 * @returns {number[]} `[width, height]` — half of the window width and the
 *   full window height.
 */
const updateTargetDims = () => {
  const targetWidth = windowWidth / 2;
  const targetHeight = windowHeight;
  return [targetWidth, targetHeight];
};

/**
 * Renders a code listing and its accompanying plan text into the page.
 *
 * Each text is passed through `hljs.highlight` and the returned markup is
 * written to the `innerHTML` of the element with the matching id: `code` is
 * highlighted as Python, `plan` as plain text. A target element that is not
 * present in the document is skipped.
 *
 * @param {string} code - text rendered into the element with id `code`.
 * @param {string} plan - text rendered into the element with id `plan`.
 */
const setCodeAndPlan = (code, plan) => {
  const targets = [
    ["code", code, "python"],
    ["plan", plan, "plaintext"],
  ];

  targets.forEach(([elementId, text, language]) => {
    const target = document.getElementById(elementId);
    if (!target) {
      return;
    }
    target.innerHTML = hljs.highlight(text, { language }).value;
  });
};

/**
 * p5.js window-resize hook: resizes the canvas to the dimensions returned by
 * `updateTargetDims()` and flags that post-resize work is pending.
 *
 * The handler is registered explicitly on `window` rather than by assigning to
 * an undeclared identifier; the implicit-global form throws a ReferenceError
 * in strict mode / ES modules. p5's global mode still finds the handler
 * because it is a property of `window`.
 */
window.windowResized = () => {
  resizeCanvas(...updateTargetDims());
  // NOTE(review): `awaitingPostResizeOps` is not declared in the visible part
  // of this file — confirm it is declared elsewhere.
  awaitingPostResizeOps = true;
};

/**
 * Quintic ease-out curve: `1 - (1 - t)^5`, with `t` capped at 1.
 *
 * @param {number} t - animation progress; values above 1 are treated as 1
 *   (there is no lower clamp).
 * @returns {number} the eased progress value.
 */
const animEase = (t) => {
  const progress = Math.min(t, 1.0);
  const remaining = 1 - progress;
  return 1 - remaining ** 5;
};

// ---- global constants (fixed tuning values) ----

const scaleFactor = 0.57;
const globalAnimSpeed = 1.1;

// ---- global vars (mutable state, initial values) ----
// NOTE(review): the code that reads and updates these is outside the visible
// part of this file.

let currentElemInd = 0;
let manualSelection = false;
let globalTime = 0;

let treeStructData = {"edges": [[2, 17], [2, 112], [2, 56], [2, 83], [2, 74], [2, 7], [2, 47], [2, 81], [2, 102], [2, 126], [2, 41], [2, 128], [2, 40], [2, 26], [2, 29], [2, 97], [2, 18], [2, 33], [2, 114], [2, 136], [2, 88], [2, 127], [2, 124], [2, 111], [2, 19], [2, 117], [2, 58], [2, 73], [2, 46], [2, 66], [2, 120], [2, 63], [2, 140], [2, 39], [2, 71], [2, 79], [2, 54], [2, 137], [2, 86], [2, 118], [2, 23], [2, 146], [2, 144], [2, 148], [2, 24], [2, 109], [2, 14], [2, 87], [2, 123], [2, 105], [2, 34], [2, 103], [2, 84], [2, 68], [2, 113], [2, 135], [2, 139], [2, 147], [2, 48], [2, 32], [2, 25], [2, 78], [2, 60], [2, 43], [2, 28], [2, 64], [2, 38], [2, 67], [2, 12], [2, 94], [2, 110], [2, 145], [2, 80], [2, 13], [2, 93], [2, 95], [2, 52], [2, 115], [2, 62], [2, 129], [2, 89], [2, 6], [2, 149], [2, 45], [2, 65], [2, 130], [2, 134], [2, 108], [2, 76], [2, 90], [2, 77], [2, 36], [2, 122], [2, 22], [2, 53], [2, 107], [2, 92], [2, 72], [2, 119], [2, 125], [2, 42], [2, 132], [2, 55], [2, 69], [2, 143], [2, 99], [2, 98], [2, 96], [2, 15], [2, 138], [2, 82], [2, 31], [2, 44], [2, 91], [2, 16], [2, 100], [2, 20], [2, 61], [2, 101], [2, 141], [2, 50], [2, 142], [2, 104], [2, 37], [2, 75], [2, 27], [2, 70], [2, 59], [2, 51], [2, 121], [2, 133], [2, 57], [2, 11], [2, 49], [2, 131], [2, 116], [2, 21], [3, 8], [4, 5], [8, 9], [9, 10], [29, 30], [34, 35], [81, 85], [105, 106]], "layout": [[0.4782608695652174, 0.0], [0.4855072463768116, 0.0], [0.4927536231884058, 0.0], [0.9927536231884058, 0.0], [1.0, 0.0], [1.0, 0.33333333333333337], [0.0, 0.33333333333333337], [0.007246376811594203, 0.33333333333333337], [0.9927536231884058, 0.33333333333333337], [0.9927536231884058, 0.6666666666666667], [0.9927536231884058, 1.0], [0.014492753623188406, 0.33333333333333337], [0.021739130434782608, 0.33333333333333337], [0.028985507246376812, 0.33333333333333337], [0.036231884057971016, 0.33333333333333337], [0.043478260869565216, 0.33333333333333337], [0.050724637681159424, 
0.33333333333333337], [0.057971014492753624, 0.33333333333333337], [0.06521739130434782, 0.33333333333333337], [0.07246376811594203, 0.33333333333333337], [0.07971014492753623, 0.33333333333333337], [0.08695652173913043, 0.33333333333333337], [0.09420289855072464, 0.33333333333333337], [0.10144927536231885, 0.33333333333333337], [0.10869565217391304, 0.33333333333333337], [0.11594202898550725, 0.33333333333333337], [0.12318840579710146, 0.33333333333333337], [0.13043478260869565, 0.33333333333333337], [0.13768115942028986, 0.33333333333333337], [0.14492753623188406, 0.33333333333333337], [0.14492753623188406, 0.6666666666666667], [0.15217391304347827, 0.33333333333333337], [0.15942028985507245, 0.33333333333333337], [0.16666666666666666, 0.33333333333333337], [0.17391304347826086, 0.33333333333333337], [0.17391304347826086, 0.6666666666666667], [0.18115942028985507, 0.33333333333333337], [0.18840579710144928, 0.33333333333333337], [0.1956521739130435, 0.33333333333333337], [0.2028985507246377, 0.33333333333333337], [0.21014492753623187, 0.33333333333333337], [0.21739130434782608, 0.33333333333333337], [0.2246376811594203, 0.33333333333333337], [0.2318840579710145, 0.33333333333333337], [0.2391304347826087, 0.33333333333333337], [0.2463768115942029, 0.33333333333333337], [0.2536231884057971, 0.33333333333333337], [0.2608695652173913, 0.33333333333333337], [0.26811594202898553, 0.33333333333333337], [0.2753623188405797, 0.33333333333333337], [0.2826086956521739, 0.33333333333333337], [0.2898550724637681, 0.33333333333333337], [0.2971014492753623, 0.33333333333333337], [0.30434782608695654, 0.33333333333333337], [0.3115942028985507, 0.33333333333333337], [0.3188405797101449, 0.33333333333333337], [0.32608695652173914, 0.33333333333333337], [0.3333333333333333, 0.33333333333333337], [0.34057971014492755, 0.33333333333333337], [0.34782608695652173, 0.33333333333333337], [0.35507246376811596, 0.33333333333333337], [0.36231884057971014, 0.33333333333333337], 
[0.3695652173913043, 0.33333333333333337], [0.37681159420289856, 0.33333333333333337], [0.38405797101449274, 0.33333333333333337], [0.391304347826087, 0.33333333333333337], [0.39855072463768115, 0.33333333333333337], [0.4057971014492754, 0.33333333333333337], [0.41304347826086957, 0.33333333333333337], [0.42028985507246375, 0.33333333333333337], [0.427536231884058, 0.33333333333333337], [0.43478260869565216, 0.33333333333333337], [0.4420289855072464, 0.33333333333333337], [0.4492753623188406, 0.33333333333333337], [0.45652173913043476, 0.33333333333333337], [0.463768115942029, 0.33333333333333337], [0.47101449275362317, 0.33333333333333337], [0.4782608695652174, 0.33333333333333337], [0.4855072463768116, 0.33333333333333337], [0.4927536231884058, 0.33333333333333337], [0.5, 0.33333333333333337], [0.5072463768115942, 0.33333333333333337], [0.5144927536231884, 0.33333333333333337], [0.5217391304347826, 0.33333333333333337], [0.5289855072463768, 0.33333333333333337], [0.5072463768115942, 0.6666666666666667], [0.5362318840579711, 0.33333333333333337], [0.5434782608695652, 0.33333333333333337], [0.5507246376811594, 0.33333333333333337], [0.5579710144927537, 0.33333333333333337], [0.5652173913043478, 0.33333333333333337], [0.572463768115942, 0.33333333333333337], [0.5797101449275363, 0.33333333333333337], [0.5869565217391305, 0.33333333333333337], [0.5942028985507246, 0.33333333333333337], [0.6014492753623188, 0.33333333333333337], [0.6086956521739131, 0.33333333333333337], [0.6159420289855072, 0.33333333333333337], [0.6231884057971014, 0.33333333333333337], [0.6304347826086957, 0.33333333333333337], [0.6376811594202898, 0.33333333333333337], [0.644927536231884, 0.33333333333333337], [0.6521739130434783, 0.33333333333333337], [0.6594202898550725, 0.33333333333333337], [0.6666666666666666, 0.33333333333333337], [0.6739130434782609, 0.33333333333333337], [0.6739130434782609, 0.6666666666666667], [0.6811594202898551, 0.33333333333333337], [0.6884057971014492, 
0.33333333333333337], [0.6956521739130435, 0.33333333333333337], [0.7028985507246377, 0.33333333333333337], [0.7101449275362319, 0.33333333333333337], [0.717391304347826, 0.33333333333333337], [0.7246376811594203, 0.33333333333333337], [0.7318840579710145, 0.33333333333333337], [0.7391304347826086, 0.33333333333333337], [0.7463768115942029, 0.33333333333333337], [0.7536231884057971, 0.33333333333333337], [0.7608695652173914, 0.33333333333333337], [0.7681159420289855, 0.33333333333333337], [0.7753623188405797, 0.33333333333333337], [0.782608695652174, 0.33333333333333337], [0.7898550724637681, 0.33333333333333337], [0.7971014492753623, 0.33333333333333337], [0.8043478260869565, 0.33333333333333337], [0.8115942028985508, 0.33333333333333337], [0.8188405797101449, 0.33333333333333337], [0.8260869565217391, 0.33333333333333337], [0.8333333333333334, 0.33333333333333337], [0.8405797101449275, 0.33333333333333337], [0.8478260869565217, 0.33333333333333337], [0.855072463768116, 0.33333333333333337], [0.8623188405797102, 0.33333333333333337], [0.8695652173913043, 0.33333333333333337], [0.8768115942028986, 0.33333333333333337], [0.8840579710144928, 0.33333333333333337], [0.8913043478260869, 0.33333333333333337], [0.8985507246376812, 0.33333333333333337], [0.9057971014492754, 0.33333333333333337], [0.9130434782608695, 0.33333333333333337], [0.9202898550724637, 0.33333333333333337], [0.927536231884058, 0.33333333333333337], [0.9347826086956522, 0.33333333333333337], [0.9420289855072463, 0.33333333333333337], [0.9492753623188406, 0.33333333333333337], [0.9565217391304348, 0.33333333333333337], [0.9637681159420289, 0.33333333333333337], [0.9710144927536232, 0.33333333333333337], [0.9782608695652174, 0.33333333333333337], [0.9855072463768116, 0.33333333333333337]], "plan": ["To solve the problem of single-cell RNA-seq label projection, I propose using a\nsimple K-Nearest Neighbors (KNN) approach. 
We will utilize the normalized\nexpression data from the training and test datasets to project the cell type\nlabels from the training set onto the test set. The KNN algorithm will find the\nnearest neighbors in the training data for each sample in the test data and\nassign the most common cell type label among those neighbors. Finally, we will\nevaluate the predictions using the provided evaluation function and save the\nresults in a `submission.csv` file.", "To solve the single-cell RNA-seq label projection task, I propose using a simple\nRandom Forest classifier. This method will leverage the normalized expression\ndata from both the training and test datasets to predict the cell type labels.\nThe Random Forest model will be trained on the training data, and then\npredictions will be made for the test data. Finally, we will evaluate the\npredictions using the provided evaluation function and save the results in a\n`submission.csv` file.", "To tackle the single-cell RNA-seq label projection task, I propose using a\nSupport Vector Machine (SVM) classifier. This approach will utilize the\nnormalized expression data from the training dataset to train the SVM model,\nwhich will then predict the cell type labels for the test dataset. The\npredictions will be evaluated using the provided evaluation function, and the\nresults will be saved in a `submission.csv` file for grading. This method is\nexpected to perform well given the nature of the data.", "To approach the single-cell RNA-seq label projection task, I will implement a\nGradient Boosting classifier using the normalized expression data from the\ntraining dataset to predict the cell type labels for the test dataset. The\nGradient Boosting model is expected to capture complex patterns in the data and\nprovide better performance compared to simpler models. After training the model\non the training set, I will make predictions on the test set and evaluate the\nresults using the provided evaluation function. 
The predictions will be saved in\na `submission.csv` file in the `./working` directory.", "To approach the single-cell RNA-seq label projection task, I propose using a\nGradient Boosting Classifier. This model will be trained on the normalized\nexpression data from the training set and will predict the cell type labels for\nthe test set. The predictions will then be evaluated using the provided\nevaluation function, and the results will be saved in a `submission.csv` file\nfor grading. This method is expected to perform well given its ability to\ncapture complex patterns in the data.", "The issue in the previous implementation appears to stem from the model training\ntime exceeding the allotted limit, likely due to the complexity of the Gradient\nBoosting Classifier on a large dataset. To resolve this, I will switch to a\nsimpler model, such as a Random Forest Classifier, which typically has faster\ntraining times while still providing reasonable performance for classification\ntasks. Additionally, I will implement 5-fold cross-validation to ensure a robust\nevaluation of the model's performance. The predicted labels will still be saved\nin the `adata_test` object and exported as a submission file.", "To improve the current solution, I propose implementing a Random Forest\nclassifier instead of the SVM model. Random Forests are generally faster to\ntrain and can handle larger datasets efficiently while providing good predictive\nperformance. Additionally, I will incorporate 5-fold cross-validation to ensure\nthe model's robustness and evaluate its performance more reliably. This change\nshould help in achieving better accuracy and F1 scores while maintaining\nreasonable training times.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest Classifier. Random Forests are generally faster\nto train and can handle larger datasets more efficiently while still providing\nstrong predictive performance. 
Additionally, I will implement 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help in achieving better accuracy and F1 scores.", "The previous implementation attempted to perform model fitting and evaluation\nwithout properly handling the data's dimensionality or ensuring that the model\nwas trained effectively. To address this, I will ensure that the input data for\nthe model is correctly reshaped and that the model is trained using the entire\ntraining dataset rather than a split. Additionally, I will utilize cross-\nvalidation to better assess the model's performance before making predictions on\nthe test set. Finally, I will save the predictions in the required submission\nformat.", "The issue in the previous implementation arises from the use of\n`cross_val_predict` which can be time-consuming and may exceed runtime limits,\nespecially with large datasets. Instead, we can simplify the approach by\ndirectly training the model on the entire training dataset and predicting on the\ntest set without cross-validation. This will reduce computation time while still\nallowing us to evaluate the model's performance effectively. Additionally, we\nensure that predictions are saved to the required submission file.", "The issue in the previous implementation arises from the potential size of the\ndataset, which may lead to long training times and possible timeouts. To address\nthis, we can use a more efficient model such as `RandomForestClassifier`, which\ntypically trains faster than `GradientBoostingClassifier`. Additionally, we can\nimplement 5-fold cross-validation to better evaluate the model's performance and\nensure that it generalizes well to unseen data. Finally, we will save the\npredictions in the required `submission.csv` file.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests typically\ntrain faster and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a more robust evaluation of the model's performance,\nwhich should help in achieving better accuracy and F1 scores. Finally, I will\nsave the predictions in the required `submission.csv` file.", "To improve the previous solution, I will implement a Random Forest classifier\ninstead of the Support Vector Machine (SVM). Random Forests typically offer\nfaster training times and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will incorporate\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance. This approach should help achieve better accuracy and F1 scores\nwhile maintaining reasonable training times.", "To improve the previous solution, I propose switching from the SVM classifier to\na Random Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nmore robust evaluation of the model's performance, which should help achieve\nbetter accuracy and F1 scores. Finally, I will save the predictions in the\nrequired `submission.csv` file.", "To improve the previous solution, I will switch from the SVM classifier to a\nRandom Forest classifier. Random Forests typically train faster and can handle\nlarger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance, which should help achieve better\naccuracy and F1 scores. 
This approach will allow us to leverage the ensemble\nnature of Random Forests to improve the classification results.", "To improve the previous solution, I propose utilizing a Random Forest classifier\ninstead of the SVM model. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nmore robust evaluation of the model's performance, which should help achieve\nbetter accuracy and F1 scores. This approach will allow us to leverage the\nensemble nature of Random Forests to improve the classification results.", "To enhance the previous solution, I propose switching from the Support Vector\nMachine (SVM) model to a Random Forest classifier. Random Forests typically\ntrain faster and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a more robust evaluation of the model's performance.\nThis change is expected to improve the accuracy and F1 scores while maintaining\nreasonable training times.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest Classifier. Random Forests are typically faster\nto train and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores while maintaining reasonable\ntraining times.", "To improve the previous solution, I propose switching from the SVM classifier to\na Random Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. 
Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance, which should help achieve better\naccuracy and F1 scores while maintaining reasonable training times.", "To improve the previous solution, I propose switching from the SVM classifier to\na Random Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance, which should help achieve better\naccuracy and F1 scores while maintaining reasonable training times.", "To enhance the performance of the previous solution, I propose using a Random\nForest classifier instead of the SVM model. Random Forests are generally faster\nto train and can manage larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will incorporate 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach will leverage\nthe ensemble nature of Random Forests to improve classification results.", "To enhance the previous solution, I propose switching from the Support Vector\nMachine (SVM) model to a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance. This\napproach will leverage the ensemble nature of Random Forests to improve\nclassification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach will leverage the ensemble nature of Random Forests to improve\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests are generally faster\nto train and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach will leverage\nthe ensemble nature of Random Forests to improve classification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores while\nmaintaining reasonable training times. Finally, I will save the predictions in\nthe required `submission.csv` file.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\noffer faster training times and can handle larger datasets more efficiently\nwhile still providing strong predictive performance. 
Additionally, I will\nimplement 5-fold cross-validation to ensure a more robust evaluation of the\nmodel's performance and to help achieve better accuracy and F1 scores. This\nchange should allow the model to leverage the ensemble nature of Random Forests\nfor improved classification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests are generally faster\nto train and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a robust evaluation of the model's performance and to help\nachieve better accuracy and F1 scores. This change is expected to leverage the\nensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I propose switching from the SVM classifier to\na Random Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nmore robust evaluation of the model's performance, which should help achieve\nbetter accuracy and F1 scores. Finally, I will save the predictions in the\nrequired `submission.csv` file.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. 
This approach will leverage\nthe ensemble nature of Random Forests to enhance classification results.", "To enhance the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically train\nfaster and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach will leverage\nthe ensemble nature of Random Forests to improve classification results.", "The issue in the previous implementation arises from using `cross_val_predict`\nto predict the labels for the test data, which is incorrect as it requires the\nsame number of samples in both the feature and target arrays. Instead, we should\ntrain the Random Forest model on the training data and then directly use the\ntrained model to predict the labels for the test data. This way, we ensure that\nthe model is applied correctly to the test dataset. I will implement this fix by\ntraining the model on `X_train` and `y_train`, and then using it to predict the\nlabels for `X_test`.", "To improve the previous solution, I will switch from using the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach will leverage the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. Finally, I will save the\npredictions in the required `submission.csv` file.", "To enhance the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\noffer faster training times and can handle larger datasets more efficiently\nwhile still providing strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a more robust evaluation of the\nmodel's performance and to help achieve better accuracy and F1 scores. This\napproach is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To improve the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance. This approach should help achieve\nbetter accuracy and F1 scores while maintaining reasonable training times.", "The issue in the previous implementation arises from using cross-validation to\npredict on the training set, which results in a number of predictions (3885)\nthat do not match the number of cells in the test set (1115). Instead, we should\ntrain the model on the entire training dataset and then directly predict on the\ntest dataset. This will ensure that the predicted labels for the test dataset\nare correctly assigned to `adata_test.obs[\"celltype_label\"]`. 
I will implement\nthis fix by training the model on the full training data and then predicting the\nlabels for the test data.", "To improve the previous solution, I propose replacing the Support Vector Machine\n(SVM) with a Random Forest classifier. Random Forests typically train faster and\ncan handle larger datasets more efficiently while still providing strong\npredictive performance. Additionally, I will implement 5-fold cross-validation\nto ensure a more robust evaluation of the model's performance, which should help\nachieve better accuracy and F1 scores.", "To improve the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests generally train faster and can handle\nlarger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance, which should help achieve better\naccuracy and F1 scores. This approach will leverage the ensemble nature of\nRandom Forests for improved classification results.", "To improve the previous solution, I propose switching from the SVM classifier to\na Random Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance, which should help achieve better\naccuracy and F1 scores. This approach will leverage the ensemble nature of\nRandom Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. 
Additionally, I will implement 5-fold\ncross-validation to evaluate the model's performance more robustly. This\napproach is expected to enhance the accuracy and F1 scores by leveraging the\nensemble nature of Random Forests.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach will leverage\nthe ensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach will leverage the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests typically offer\nfaster training times and can handle larger datasets more efficiently while\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. 
This approach will leverage\nthe ensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I propose switching from the SVM classifier to\na Random Forest classifier. Random Forests generally offer faster training times\nand can manage larger datasets more efficiently while still providing strong\npredictive performance. Additionally, I will implement 5-fold cross-validation\nto ensure a robust evaluation of the model's performance and to help achieve\nbetter accuracy and F1 scores. This approach is expected to leverage the\nensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests are generally faster\nto train and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change aims to leverage\nthe ensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance, which should help achieve better\naccuracy and F1 scores. This approach will leverage the ensemble nature of\nRandom Forests to enhance classification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach will leverage the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance and to help achieve better accuracy and F1 scores. This approach\nleverages the ensemble nature of Random Forests to enhance classification\nresults.", "To improve the previous solution, I will switch from the Support Vector Machine\n(SVM) classifier to a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests to enhance classification\nresults.", "To improve the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests generally provide faster training times\nand can handle larger datasets more efficiently while still delivering strong\npredictive performance. Additionally, I will implement 5-fold cross-validation\nto ensure a robust evaluation of the model's performance and to help achieve\nbetter accuracy and F1 scores. 
This approach leverages the ensemble nature of\nRandom Forests to enhance classification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\ntrain faster and handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance, which should help achieve better\naccuracy and F1 scores. This approach should leverage the ensemble nature of\nRandom Forests to enhance classification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests generally provide\nfaster training times and can handle larger datasets more efficiently while\nstill delivering strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests are\ntypically faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\noffer faster training times and can handle larger datasets more efficiently\nwhile still providing strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach will leverage the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally offer\nfaster training times and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests are generally faster\nto train and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a robust evaluation of the model's performance. 
This change\naims to leverage the ensemble nature of Random Forests for improved\nclassification results.", "To improve the current solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To enhance the performance of the previous solution, I will replace the Support\nVector Machine (SVM) classifier with a Random Forest classifier. Random Forests\nare generally faster to train and can handle larger datasets more efficiently\nwhile still providing strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach will leverage the ensemble nature of Random Forests to improve\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\nchange is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To improve the existing solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests generally provide faster training times\nand can handle larger datasets more efficiently while still delivering strong\npredictive performance. Additionally, I will implement 5-fold cross-validation\nto ensure a robust evaluation of the model's performance and to help achieve\nbetter accuracy and F1 scores. This change is expected to leverage the ensemble\nnature of Random Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically have\nfaster training times and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\nchange is expected to enhance the classification results significantly.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. 
This\napproach will leverage the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\nchange is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To enhance the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\noffer faster training times and can handle larger datasets more efficiently\nwhile still providing strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. 
Random Forests are generally\nfaster to train, can handle larger datasets more efficiently, and provide strong\npredictive performance. Additionally, I will implement 5-fold cross-validation\nfor model evaluation, which will help ensure robust performance assessment and\npotentially improve the accuracy and F1 scores. This approach is expected to\nleverage the ensemble nature of Random Forests for enhanced classification\nresults.", "To enhance the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To enhance the performance of the previous solution, I will replace the Support\nVector Machine (SVM) classifier with a Random Forest classifier. Random Forests\ngenerally provide faster training times and can handle larger datasets more\nefficiently while still delivering strong predictive performance. Additionally,\nI will implement 5-fold cross-validation to ensure a robust evaluation of the\nmodel's performance, which should help achieve better accuracy and F1 scores.\nThis change is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To enhance the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. 
Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to improve\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach will leverage the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To enhance the existing solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests typically\ntrain faster and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a more robust evaluation of the model's performance,\nwhich should help achieve better accuracy and F1 scores. 
This approach leverages\nthe ensemble nature of Random Forests to improve classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests to enhance classification results.", "To enhance the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to improve\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. 
Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To enhance the performance of the previous solution, I propose switching from\nthe Support Vector Machine (SVM) classifier to a Random Forest classifier.\nRandom Forests are generally faster to train and can handle larger datasets more\nefficiently while still providing strong predictive performance. Additionally, I\nwill implement 5-fold cross-validation to ensure a robust evaluation of the\nmodel's performance, which should help achieve better accuracy and F1 scores.\nThis adjustment is expected to leverage the ensemble nature of Random Forests\nfor improved classification results.", "To improve the existing solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and handle larger datasets more efficiently while providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to evaluate the model's performance more robustly, which should help\nachieve better accuracy and F1 scores. This change aims to leverage the ensemble\nnature of Random Forests for improved classification results.", "To improve the previous solution, I will switch from the Support Vector Machine\n(SVM) classifier to a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. 
Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To enhance the performance of the previous solution, I will switch from using a\nSupport Vector Machine (SVM) classifier to a Random Forest classifier. Random\nForests are generally faster to train and can handle larger datasets more\nefficiently while still providing strong predictive performance. Additionally, I\nwill implement 5-fold cross-validation to ensure a robust evaluation of the\nmodel's performance, which should help achieve better accuracy and F1 scores.\nThis approach leverages the ensemble nature of Random Forests for improved\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests typically train\nfaster and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach will leverage\nthe ensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. 
This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation for a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "The issue in the previous implementation arises from the use of\n`cross_val_predict`, which produces predictions for the training data rather\nthan the test data. To fix this, we should train the Random Forest classifier on\nthe training data and then use it to predict labels for the test data. This way,\nthe predicted labels can be correctly assigned to\n`adata_test.obs[\"celltype_label\"]`. Additionally, we will ensure that the\nevaluation metrics are printed after making predictions on the test set.", "To enhance the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to improve\nclassification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. This change is expected to enhance the\nmodel's accuracy and F1 scores, as Random Forests can better capture the\nunderlying patterns in the data due to their ensemble nature. Additionally, I\nwill implement 5-fold cross-validation for a more robust evaluation of the\nmodel's performance.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally train\nfaster and handle larger datasets more efficiently while still providing strong\npredictive performance. Additionally, I will implement 5-fold cross-validation\nto ensure a more robust evaluation of the model's performance and to help\nachieve better accuracy and F1 scores. This approach is expected to leverage the\nensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. 
This change aims to leverage\nthe ensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. I will also implement 5-fold cross-\nvalidation to evaluate the model's performance more robustly, which should help\nachieve better accuracy and F1 scores. This change is expected to leverage the\nensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically\nprovide faster training times and are better suited for handling larger datasets\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a more robust evaluation of the\nmodel's performance, which should help achieve better accuracy and F1 scores.\nThis change is expected to leverage the ensemble nature of Random Forests to\nenhance classification results.", "To improve the previous solution, I will switch from the Support Vector Machine\n(SVM) classifier to a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a more robust evaluation of the model's performance,\nwhich should help achieve better accuracy and F1 scores. This change is expected\nto leverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests are\ngenerally faster to train, can handle larger datasets more efficiently, and\nprovide strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests for improved\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. 
This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile delivering strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\nchange is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To improve the previous solution, I will switch from the Support Vector Machine\n(SVM) classifier to a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach should leverage\nthe ensemble nature of Random Forests to enhance classification results.", "To improve the performance of the previous solution, I will switch from using a\nSupport Vector Machine (SVM) classifier to a Random Forest classifier. 
Random\nForests generally provide faster training times and can handle larger datasets\nmore efficiently while still delivering strong predictive performance.\nAdditionally, I will implement 5-fold cross-validation to ensure a robust\nevaluation of the model's performance, which should help achieve better accuracy\nand F1 scores. This change aims to leverage the ensemble nature of Random\nForests for improved classification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a more robust evaluation of the\nmodel's performance, which should help achieve better accuracy and F1 scores.\nThis approach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To enhance the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance and potentially improve accuracy\nand F1 scores. This approach leverages the ensemble nature of Random Forests for\nimproved classification results.", "To enhance the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. 
This\nchange should leverage the ensemble nature of Random Forests to improve\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a more robust evaluation of the model's performance,\nwhich should help achieve better accuracy and F1 scores. This approach leverages\nthe ensemble nature of Random Forests to enhance classification results.", "The issue in the previous implementation arises from the incorrect use of\n`cross_val_predict` for making predictions on the test set. The function is\ndesigned for use with training data only, and using it with `X_test` and\n`y_train` causes a mismatch in the number of samples. To fix this, I will first\ntrain the Random Forest model on the training data and then use the trained\nmodel to predict the labels for the test set directly. This will ensure that the\npredictions are made correctly without any size mismatch.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests generally\nprovide faster training times, can handle larger datasets more efficiently, and\nare better suited for capturing complex patterns in the data. Additionally, I\nwill implement 5-fold cross-validation to ensure a robust evaluation of the\nmodel's performance, which should help achieve better accuracy and F1 scores.\nThis approach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently, while also\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically train\nfaster and can handle larger datasets more efficiently, while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation for a more robust evaluation of the model's performance. This\napproach is expected to leverage the ensemble nature of Random Forests for\nenhanced classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. 
This approach is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically train\nfaster and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To enhance the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a more robust evaluation of the model's performance,\nwhich should help achieve better accuracy and F1 scores. This approach is\nexpected to leverage the ensemble nature of Random Forests for improved\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\nchange is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. 
Random Forests are generally\nfaster to train and can handle larger datasets more efficiently, while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile delivering strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically\nprovide faster training times and can handle larger datasets more efficiently,\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation for a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\nchange is expected to leverage the ensemble nature of Random Forests to enhance\nclassification results.", "To enhance the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. 
Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to improve\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests for improved\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently,\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests for improved\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests are generally faster\nto train and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation to ensure a more robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. 
This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance and to\nhelp achieve better accuracy and F1 scores. This approach leverages the ensemble\nnature of Random Forests to enhance the classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation for a more robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests to enhance\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. 
Random Forests are generally faster\nto train and can handle larger datasets more efficiently while still providing\nstrong predictive performance. Additionally, I will implement 5-fold cross-\nvalidation during the training process to ensure a robust evaluation of the\nmodel's performance. This change is expected to leverage the ensemble nature of\nRandom Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests typically\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation during the training process to ensure a robust\nevaluation of the model's performance. This change is expected to leverage the\nensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. 
This approach leverages the\nensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests are generally faster to train and can\nhandle larger datasets more efficiently while still providing strong predictive\nperformance. Additionally, I will implement 5-fold cross-validation to ensure a\nrobust evaluation of the model's performance, which should help achieve better\naccuracy and F1 scores. This approach leverages the ensemble nature of Random\nForests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I propose switching from the Support Vector\nMachine (SVM) classifier to a Random Forest classifier. Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nstill providing strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\nchange is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. 
Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will incorporate 5-fold\ncross-validation to ensure a more robust evaluation of the model's performance,\nwhich should help achieve better accuracy and F1 scores. This change is expected\nto leverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will replace the SVM classifier with a\nRandom Forest classifier. Random Forests generally provide faster training times\nand can handle larger datasets more efficiently while still offering strong\npredictive performance. Additionally, I will implement 5-fold cross-validation\nto ensure a robust evaluation of the model's performance and to help achieve\nbetter accuracy and F1 scores. This approach leverages the ensemble nature of\nRandom Forests to enhance classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. 
This approach is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This change is expected to\nleverage the ensemble nature of Random Forests for improved classification\nresults.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently, which should\nhelp improve performance on the single-cell RNA-seq label projection task.\nAdditionally, I will implement 5-fold cross-validation to provide a more robust\nevaluation of the model's performance, ensuring better accuracy and F1 scores.\nThis approach leverages the ensemble nature of Random Forests for improved\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) classifier to a Random Forest classifier. 
Random Forests are\ngenerally faster to train and can handle larger datasets more efficiently while\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation during the training process to ensure a robust evaluation of\nthe model's performance. This approach is expected to leverage the ensemble\nnature of Random Forests for improved classification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests generally provide\nfaster training times and can handle larger datasets more efficiently while\nstill delivering strong predictive performance. Additionally, I will implement\n5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\napproach leverages the ensemble nature of Random Forests for improved\nclassification results.", "To improve the previous solution, I will switch from using a Support Vector\nMachine (SVM) to a Random Forest classifier. Random Forests generally train\nfaster and can handle larger datasets more efficiently while providing strong\npredictive performance. Additionally, I will implement 5-fold cross-validation\nto ensure a more robust evaluation of the model's performance, which should help\nachieve better accuracy and F1 scores. This approach leverages the ensemble\nnature of Random Forests to enhance classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. 
This approach leverages the\nensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation to ensure a robust evaluation of the model's\nperformance, which should help achieve better accuracy and F1 scores. This\nchange is expected to leverage the ensemble nature of Random Forests for\nimproved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests generally\nprovide faster training times and can handle larger datasets more efficiently\nwhile still delivering strong predictive performance. Additionally, I will\nimplement 5-fold cross-validation during the training process to ensure a robust\nevaluation of the model's performance. This approach leverages the ensemble\nnature of Random Forests for improved classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are typically\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests to enhance classification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. 
Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests, which can improve classification results on\ncomplex datasets like single-cell RNA-seq data.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests for improved classification results.", "To improve the previous solution, I will switch from the Support Vector Machine\n(SVM) classifier to a Random Forest classifier. Random Forests generally provide\nfaster training times and can handle larger datasets more efficiently, which\nshould help improve performance on the single-cell RNA-seq label projection\ntask. Additionally, I will implement 5-fold cross-validation during the training\nprocess to ensure a robust evaluation of the model's performance. This approach\nis expected to leverage the ensemble nature of Random Forests for improved\nclassification results.", "To improve the previous solution, I will replace the Support Vector Machine\n(SVM) classifier with a Random Forest classifier. Random Forests are generally\nfaster to train and can handle larger datasets more efficiently while still\nproviding strong predictive performance. 
Additionally, I will implement 5-fold\ncross-validation to ensure a robust evaluation of the model's performance, which\nshould help achieve better accuracy and F1 scores. This approach leverages the\nensemble nature of Random Forests for improved classification results."], "code": ["import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load data\ntrain_data = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\ntest_data = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare data\nX_train = train_data.layers[\"normalized\"]\ny_train = train_data.obs[\"cell_type\"]\nX_test = test_data.layers[\"normalized\"]\n\n# KNN for label projection\nknn = KNeighborsClassifier(n_neighbors=5)\nknn.fit(X_train, y_train)\n\n# Predict labels for test data\ntest_data.obs[\"celltype_label\"] = knn.predict(X_test)\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(test_data)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame({\"cell_type\": test_data.obs[\"celltype_label\"]})\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import pandas as pd\nimport numpy as np\nimport anndata\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import 
accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split\n\n# Load the data\ntrain_data = anndata.read_h5ad(\"./input/cxg_immune_5k_train.h5ad\")\ntest_data = anndata.read_h5ad(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare the data\nX_train = train_data.layers[\"normalized\"]\ny_train = train_data.obs[\"cell_type\"]\nX_test = test_data.layers[\"normalized\"]\n\n# Train a Random Forest Classifier\nmodel = RandomForestClassifier(n_estimators=100, random_state=42)\nmodel.fit(X_train, y_train)\n\n# Make predictions on the test set\ntest_predictions = model.predict(X_test)\ntest_data.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate the predictions\nmetrics = evaluate_predictions(test_data)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame({\"cell_type\": test_data.obs[\"celltype_label\"]})\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.svm import SVC\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training 
data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the SVM classifier\nsvm_model = SVC(kernel=\"linear\", probability=True)\nsvm_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = svm_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport anndata as ad\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split\n\n# Load the data\nadata_train = ad.read_h5ad(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = ad.read_h5ad(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Extract features and labels\nX_train = adata_train.layers[\"normalized\"]\ny_train = 
adata_train.obs[\"cell_type\"]\nX_test = adata_test.layers[\"normalized\"]\n\n# Encode labels\nencoder = LabelEncoder()\ny_train_encoded = encoder.fit_transform(y_train)\n\n# Train-test split for cross-validation\nX_train_split, X_val, y_train_split, y_val = train_test_split(\n    X_train, y_train_encoded, test_size=0.2, random_state=42\n)\n\n# Initialize and fit the Gradient Boosting model\nmodel = GradientBoostingClassifier()\nmodel.fit(X_train_split, y_train_split)\n\n# Validate the model\ny_val_pred = model.predict(X_val)\nval_accuracy = accuracy_score(y_val, y_val_pred)\nval_f1_macro = f1_score(y_val, y_val_pred, average=\"macro\")\nval_f1_micro = f1_score(y_val, y_val_pred, average=\"micro\")\nval_f1_weighted = f1_score(y_val, y_val_pred, average=\"weighted\")\n\n# Print validation metrics\nprint(f\"Validation Accuracy: {val_accuracy}\")\nprint(f\"Validation F1 Macro: {val_f1_macro}\")\nprint(f\"Validation F1 Micro: {val_f1_micro}\")\nprint(f\"Validation F1 Weighted: {val_f1_weighted}\")\n\n# Make predictions on the test set\ny_test_pred = model.predict(X_test)\nadata_test.obs[\"celltype_label\"] = encoder.inverse_transform(y_test_pred)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame({\"celltype_label\": adata_test.obs[\"celltype_label\"]})\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n\n\n# Evaluate predictions\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n    return metrics\n\n\n# Evaluate final predictions\nmetrics = 
evaluate_predictions(adata_test)\nprint(metrics)\n", "import pandas as pd\nimport numpy as np\nimport anndata\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = anndata.read_h5ad(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = anndata.read_h5ad(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare the training and test data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"]\nX_test = adata_test.layers[\"normalized\"]\n\n# Encode labels\nencoder = LabelEncoder()\ny_train_encoded = encoder.fit_transform(y_train)\n\n# Train the Gradient Boosting Classifier\nmodel = GradientBoostingClassifier()\nmodel.fit(X_train, y_train_encoded)\n\n# Make predictions\ny_pred_encoded = model.predict(X_test)\n\n# Decode the predictions back to original labels\ny_pred = encoder.inverse_transform(y_pred_encoded)\n\n# Save predictions to adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate the predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save the submission\nsubmission_df = pd.DataFrame({\"celltype_label\": y_pred})\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import pandas as pd\nimport numpy 
as np\nimport anndata\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = anndata.read_h5ad(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = anndata.read_h5ad(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare the training and test data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"]\nX_test = adata_test.layers[\"normalized\"]\n\n# Encode labels\nencoder = LabelEncoder()\ny_train_encoded = encoder.fit_transform(y_train)\n\n# Initialize the model\nmodel = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_encoded = cross_val_predict(model, X_train, y_train_encoded, cv=cv)\n\n# Fit the model on the entire training data\nmodel.fit(X_train, y_train_encoded)\n\n# Make predictions on the test set\ny_pred_encoded_test = model.predict(X_test)\n\n# Decode the predictions back to original labels\ny_pred_test = encoder.inverse_transform(y_pred_encoded_test)\n\n# Save predictions to adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate the predictions\nmetrics = 
evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save the submission\nsubmission_df = pd.DataFrame({\"celltype_label\": y_pred_test})\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the Random Forest classifier\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    
return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": 
accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport anndata as ad\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\n\n# Load the data\nadata_train = ad.read_h5ad(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = ad.read_h5ad(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Extract features and labels\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"]\nX_test = adata_test.layers[\"normalized\"]\n\n# Encode labels\nencoder = LabelEncoder()\ny_train_encoded = encoder.fit_transform(y_train)\n\n# Initialize the Gradient Boosting model\nmodel = GradientBoostingClassifier()\n\n# Perform cross-validation and make predictions\ny_train_pred = cross_val_predict(model, X_train, y_train_encoded, cv=5)\n\n# Train the model on the entire training set\nmodel.fit(X_train, y_train_encoded)\n\n# Make predictions on the test set\ny_test_pred = model.predict(X_test)\nadata_test.obs[\"celltype_label\"] = encoder.inverse_transform(y_test_pred)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame({\"celltype_label\": adata_test.obs[\"celltype_label\"]})\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n\n\n# Evaluate predictions\ndef evaluate_predictions(adata_test):\n    y_true = 
adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n    return metrics\n\n\n# Evaluate final predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n", "import numpy as np\nimport pandas as pd\nimport anndata as ad\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = ad.read_h5ad(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = ad.read_h5ad(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Extract features and labels\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"]\nX_test = adata_test.layers[\"normalized\"]\n\n# Encode labels\nencoder = LabelEncoder()\ny_train_encoded = encoder.fit_transform(y_train)\n\n# Initialize the Gradient Boosting model\nmodel = GradientBoostingClassifier()\n\n# Train the model on the entire training set\nmodel.fit(X_train, y_train_encoded)\n\n# Make predictions on the test set\ny_test_pred = model.predict(X_test)\nadata_test.obs[\"celltype_label\"] = encoder.inverse_transform(y_test_pred)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame({\"celltype_label\": adata_test.obs[\"celltype_label\"]})\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n\n\n# Evaluate predictions\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n    all_labels = np.unique(y_true + y_pred)\n    encoder = 
LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n    return metrics\n\n\n# Evaluate final predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n", "import numpy as np\nimport pandas as pd\nimport anndata as ad\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = ad.read_h5ad(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = ad.read_h5ad(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Extract features and labels\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"]\nX_test = adata_test.layers[\"normalized\"]\n\n# Encode labels\nencoder = LabelEncoder()\ny_train_encoded = encoder.fit_transform(y_train)\n\n# Initialize the Random Forest model\nmodel = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use cross-validation to make predictions on the training set\ny_train_pred = cross_val_predict(model, X_train, y_train_encoded, cv=5)\n\n# Fit the model on the entire training set\nmodel.fit(X_train, y_train_encoded)\n\n# Make predictions on the test set\ny_test_pred = model.predict(X_test)\nadata_test.obs[\"celltype_label\"] = encoder.inverse_transform(y_test_pred)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame({\"celltype_label\": adata_test.obs[\"celltype_label\"]})\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n\n\n# Evaluate predictions\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n    
all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n    return metrics\n\n\n# Evaluate final predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation and fit the model\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = 
encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Save predictions in adata_train for evaluation\nadata_train.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluate predictions\ndef evaluate_predictions(adata_train):\n    y_true = adata_train.obs[\"cell_type\"].tolist()\n    y_pred = adata_train.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = 
encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions on training set\nmetrics = evaluate_predictions(adata_train)\nprint(metrics)\n\n# Train final model and predict on test data\nrf_model.fit(X_train, y_train)\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the Random Forest classifier\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in 
adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with 5-fold cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncross_val_scores = cross_val_score(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# 
Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Set up 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5)\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = 
cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest 
Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for predictions\ncv = StratifiedKFold(n_splits=5)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = 
adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5)\ncross_val_scores = cross_val_score(\n    rf_model, X_train, y_train, cv=cv, scoring=\"accuracy\"\n)\nprint(f\"Cross-validated accuracy: {np.mean(cross_val_scores)}\")\n\n# Train the Random Forest classifier\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import accuracy_score, 
f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the Random Forest classifier\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport 
scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the Random Forest classifier\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        
\"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with 5-fold cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training dataset\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df 
= pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5)\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=cv, scoring=\"accuracy\")\n\n# Train the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    
return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the Random Forest classifier\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, 
y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred_test = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = 
LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = 
adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncross_val_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cross_val_scores)}\")\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = 
rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, 
scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the Random Forest classifier on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random 
Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test 
data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = 
sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = cross_val_predict(rf_model, X_test, y_train, cv=5)\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = 
adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Predict labels for the test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = 
adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with 5-fold cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_predict\n\n# Load the data\nadata_train = 
sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation to predict on the training data\ny_train_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the Random Forest model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, 
f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", 
index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluate predictions\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport 
scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier on the entire training set\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Predict on the test set\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluate predictions\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics 
import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom 
sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as 
np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation and predict on the training data\ny_train_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Train the Random Forest model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n     
   \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use 5-fold cross-validation to get predictions on the training set\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_train_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# 
Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier using cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] 
= f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for 
avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation to predict on the training data\nskf = StratifiedKFold(n_splits=5)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=skf)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    
y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on the training data\ncv = StratifiedKFold(n_splits=5)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = 
adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implementing 5-fold cross-validation\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# Fit the model on the entire training data for final predictions\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in 
adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# 
Train the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, 
random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = 
adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the entire training set for final predictions\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = 
sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the Random Forest model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom 
sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    
}\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation and predict on training data\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Train the model on the full training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = 
pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the Random Forest model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred_test = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        
metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, 
y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n 
       \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on the training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + 
y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = 
adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training dataset\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = 
y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on the training data\nskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=skf)\n\n# Train 
the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# 
Implement 5-fold cross-validation to evaluate model performance\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# 
Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on the test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = 
adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use cross-validation to predict on training data\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = 
sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for predictions\nskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=skf)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom 
sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# 5-fold cross-validation to predict labels\ncv = StratifiedKFold(n_splits=5)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    
}\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for training\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = 
evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform cross-validation predictions on the training data\ny_pred_train = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred_test = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        
metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use cross-validation to predict on training data\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": 
accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for robust evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = 
LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform cross-validation and predict on training data\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = 
adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in 
adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for predictions\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, 
X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = 
RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation and predict on the training data\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = 
adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use 5-fold cross-validation to predict on the training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred_test = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the 
data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use cross-validation to predict on the training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the Random Forest model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import 
RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier using cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy 
as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Save predictions in adata_test\nrf_model.fit(X_train, y_train)  # Fit on the entire training set for final predictions\ny_test_pred = rf_model.predict(X_test)\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    
}\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation and predict\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = rf_model.fit(X_train, y_train).predict(X_test)\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": 
adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# 
Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use cross_val_predict to get predictions for the training data\ny_pred_train = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred_test = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, 
y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for robust evaluation\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = 
encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use Stratified K-Folds for cross-validation\nskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n\n# Cross-validated predictions\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=skf)\n\n# Fit the model on the entire training set for final predictions\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef 
evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=kf, scoring=\"accuracy\")\n\n# Train the model on the entire training data\nrf_model.fit(X_train, 
y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier using cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, 
X_train, y_train, cv=5)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = 
cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Define the Random Forest 
classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = 
adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data for final predictions\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split\n\n# Load the data\nadata_train = 
sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Predict labels for the test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = 
sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation to predict on training data\ncv = StratifiedKFold(n_splits=5)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import 
accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on training data\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# Train the Random Forest model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred_test = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    
}\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save 
predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data for final predictions\nrf_model.fit(X_train, y_train)\nfinal_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = final_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# 
Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training set for final predictions\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", 
\"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] 
= f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for 
avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = 
encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred_test = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = 
adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to get predictions for the training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred_test = 
rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred_test\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, 
random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the 
Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data and predict on the test data\nrf_model.fit(X_train, y_train)\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training 
data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set to predict on test data\nrf_model.fit(X_train, y_train)\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, 
StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the Random Forest model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as 
sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for robust evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        
\"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on the training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, 
average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set for final predictions\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    
metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set for predictions on test data\nrf_model.fit(X_train, y_train)\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + 
y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for robust evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef 
evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = cross_val_predict(rf_model, X_test, y_train, cv=5)\n\n\n# Evaluation function\ndef 
evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Predict the labels for the test set\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = 
adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# 
Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = 
cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest 
Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict on training data\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the full training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare 
training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for evaluation\nskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=skf)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom 
sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import 
numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        
\"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data to predict on test data\nrf_model.fit(X_train, y_train)\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save 
predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation and predict\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = rf_model.fit(X_train, y_train).predict(X_test)\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate 
predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use StratifiedKFold for cross-validation\nskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n\n# Perform cross-validation and predict on training data\ny_pred_cv = cross_val_predict(rf_model, X_train, y_train, cv=skf)\n\n# Train the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + 
three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = 
encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = 
adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier using cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef 
evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in 
adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# Fit the 
model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, 
random_state=42)\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split, cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\nrf_model.fit(X_train, 
y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, 
cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, 
random_state=42)\n\n# Perform 5-fold cross-validation to predict on the training data\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = 
adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the Random Forest model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = 
adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load 
the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# Train the model on the entire training dataset\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom 
sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    
}\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import train_test_split, cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        
\"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate 
predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for model evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": 
accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = 
LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation to predict labels\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef 
evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, 
y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv = 
StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Save predictions in adata_test\nrf_model.fit(X_train, y_train)\ny_test_pred = rf_model.predict(X_test)\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# 
Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions on the test set\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare 
training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier using cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = rf_model.fit(X_train, y_train).predict(X_test)\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = 
adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = 
sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with 5-fold cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the 
data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import 
LabelEncoder\nfrom sklearn.model_selection import cross_val_score\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\nprint(f\"Cross-validated accuracy: {np.mean(cv_scores)}\")\n\n# Train the Random Forest classifier\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as 
pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest Classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the full training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n    
    \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Train the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate 
predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Use 5-fold cross-validation for predictions\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the full training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        
\"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, StratifiedKFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize the Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Implement 5-fold cross-validation for evaluation\ncv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=cv)\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ntest_predictions = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = test_predictions\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n 
   encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncross_val_scores = cross_val_score(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly 
so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=5)\n\n# Fit the model on the entire training data to predict on test data\nrf_model.fit(X_train, y_train)\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    
y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import LabelEncoder\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Train the Random Forest classifier with cross-validation\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\ncv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring=\"accuracy\")\n\n# Fit the model on the entire training data\nrf_model.fit(X_train, y_train)\n\n# Predict on test data\ny_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_pred\n\n\n# Evaluation function\ndef 
evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n", "import numpy as np\nimport pandas as pd\nimport scanpy as sc\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import cross_val_predict, KFold\n\n# Load the data\nadata_train = sc.read(\"./input/cxg_immune_5k_train.h5ad\")\nadata_test = sc.read(\"./input/cxg_immune_5k_test.h5ad\")\n\n# Prepare training data\nX_train = adata_train.layers[\"normalized\"]\ny_train = adata_train.obs[\"cell_type\"].values\n\n# Prepare test data\nX_test = adata_test.layers[\"normalized\"]\n\n# Initialize Random Forest classifier\nrf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n\n# Perform 5-fold cross-validation\nkf = KFold(n_splits=5, shuffle=True, random_state=42)\ny_pred = cross_val_predict(rf_model, X_train, y_train, cv=kf)\n\n# Train the Random Forest model on the entire training set\nrf_model.fit(X_train, y_train)\n\n# Predict 
on test data\ny_test_pred = rf_model.predict(X_test)\n\n# Save predictions in adata_test\nadata_test.obs[\"celltype_label\"] = y_test_pred\n\n\n# Evaluation function\ndef evaluate_predictions(adata_test):\n    y_true = adata_test.obs[\"cell_type\"].tolist()\n    y_pred = adata_test.obs[\"celltype_label\"].tolist()\n\n    # encode jointly so mapping is consistent\n    all_labels = np.unique(y_true + y_pred)\n    encoder = LabelEncoder().fit(all_labels)\n    y_true_enc = encoder.transform(y_true)\n    y_pred_enc = encoder.transform(y_pred)\n\n    # compute accuracy + three flavours of F1\n    metrics = {\n        \"accuracy\": accuracy_score(y_true_enc, y_pred_enc),\n    }\n    for avg in [\"macro\", \"micro\", \"weighted\"]:\n        metrics[f\"f1_{avg}\"] = f1_score(y_true_enc, y_pred_enc, average=avg)\n\n    return metrics\n\n\n# Evaluate predictions\nmetrics = evaluate_predictions(adata_test)\nprint(metrics)\n\n# Save predictions to submission file\nsubmission_df = pd.DataFrame(\n    {\n        \"cell_id\": adata_test.obs.index,\n        \"celltype_label\": adata_test.obs[\"celltype_label\"],\n    }\n)\nsubmission_df.to_csv(\"./working/submission.csv\", index=False)\n"], "term_out": ["{'accuracy': 0.34977578475336324, 'f1_macro': 0.1988846324088757, 'f1_micro': 0.34977578475336324, 'f1_weighted': 0.30286549177574396}\nExecution time: 6 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 22 seconds seconds (time limit is an hour).", "{'accuracy': 0.831390134529148, 'f1_macro': 0.7483669958339666, 'f1_micro': 0.831390134529148, 'f1_weighted': 0.8236736194714547}\nExecution time: 23 minutes seconds (time limit is an hour).", "Traceback (most recent call last):\n  File \"runfile.py\", line 29, in <module>\n    model.fit(X_train_split, y_train_split)\n  File 
\"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/base.py\", line 1365, in wrapper\n    return fit_method(estimator, *args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 787, in fit\n    n_stages = self._fit_stages(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 883, in _fit_stages\n    raw_predictions = self._fit_stage(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 495, in _fit_stage\n    _update_terminal_regions(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 255, in _update_terminal_regions\n    update = compute_update(y_, indices, neg_gradient, raw_prediction, k)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 220, in compute_update\n    numerator = np.average(neg_g, weights=sw)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/numpy/lib/function_base.py\", line 549, in average\n    if np.any(scl == 0.0):\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/numpy/core/fromnumeric.py\", line 2412, in any\n    return _wrapreduction(a, np.logical_or, 'any', axis, None, out,\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/numpy/core/fromnumeric.py\", line 86, in _wrapreduction\n    return reduction(axis=axis, out=out, **passkwargs)\nKeyboardInterrupt\nTimeoutError: Execution exceeded the time limit of an hour", "Traceback (most recent call last):\n  File \"runfile.py\", line 24, in <module>\n    model.fit(X_train, y_train_encoded)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/base.py\", line 1365, in wrapper\n    return fit_method(estimator, *args, **kwargs)\n  File 
\"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 787, in fit\n    n_stages = self._fit_stages(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 883, in _fit_stages\n    raw_predictions = self._fit_stage(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 489, in _fit_stage\n    tree.fit(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/base.py\", line 1365, in wrapper\n    return fit_method(estimator, *args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/tree/_classes.py\", line 1404, in fit\n    super()._fit(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/tree/_classes.py\", line 472, in _fit\n    builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)\nKeyboardInterrupt\nTimeoutError: Execution exceeded the time limit of an hour", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Traceback (most recent call last):\n  File \"runfile.py\", line 26, in <module>\n    y_train_pred = cross_val_predict(model, X_train, y_train_encoded, cv=5)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py\", line 218, in wrapper\n    return func(*args, 
**kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py\", line 1234, in cross_val_predict\n    predictions = parallel(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/parallel.py\", line 82, in __call__\n    return super().__call__(iterable_with_config_and_warning_filters)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/joblib/parallel.py\", line 1986, in __call__\n    return output if self.return_generator else list(output)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/joblib/parallel.py\", line 1914, in _get_sequential_output\n    res = func(*args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/parallel.py\", line 147, in __call__\n    return self.function(*args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py\", line 1319, in _fit_and_predict\n    estimator.fit(X_train, y_train, **fit_params)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/base.py\", line 1365, in wrapper\n    return fit_method(estimator, *args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 787, in fit\n    n_stages = self._fit_stages(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 883, in _fit_stages\n    raw_predictions = self._fit_stage(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 489, in _fit_stage\n    tree.fit(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/base.py\", line 1365, in wrapper\n    return fit_method(estimator, *args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/tree/_classes.py\", 
line 1404, in fit\n    super()._fit(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/tree/_classes.py\", line 472, in _fit\n    builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)\nKeyboardInterrupt\nTimeoutError: Execution exceeded the time limit of an hour", "Traceback (most recent call last):\n  File \"runfile.py\", line 25, in <module>\n    model.fit(X_train, y_train_encoded)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/base.py\", line 1365, in wrapper\n    return fit_method(estimator, *args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 787, in fit\n    n_stages = self._fit_stages(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 883, in _fit_stages\n    raw_predictions = self._fit_stage(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/ensemble/_gb.py\", line 489, in _fit_stage\n    tree.fit(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/base.py\", line 1365, in wrapper\n    return fit_method(estimator, *args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/tree/_classes.py\", line 1404, in fit\n    super()._fit(\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/tree/_classes.py\", line 472, in _fit\n    builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)\nKeyboardInterrupt\nTimeoutError: Execution exceeded the time limit of an hour", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 
0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.697039897039897, 'f1_macro': 0.4561092871169358, 'f1_micro': 0.697039897039897, 'f1_weighted': 0.649915726324256}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution 
time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Traceback (most recent call last):\n  File \"runfile.py\", line 25, in <module>\n    adata_test.obs[\"celltype_label\"] = cross_val_predict(rf_model, X_test, y_train, cv=5)\n  File 
\"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py\", line 218, in wrapper\n    return func(*args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py\", line 1171, in cross_val_predict\n    X, y = indexable(X, y)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/validation.py\", line 530, in indexable\n    check_consistent_length(*result)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/validation.py\", line 473, in check_consistent_length\n    raise ValueError(\nValueError: Found input variables with inconsistent numbers of samples: [1115, 3885]\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Traceback (most recent call last):\n  File \"runfile.py\", line 25, in <module>\n    adata_test.obs[\"celltype_label\"] = y_pred\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/pandas/core/frame.py\", line 4091, in __setitem__\n    self._set_item(key, value)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/pandas/core/frame.py\", line 4300, in 
_set_item\n    value, refs = self._sanitize_column(value)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/pandas/core/frame.py\", line 5039, in _sanitize_column\n    com.require_length_match(value, self.index)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/pandas/core/common.py\", line 561, in require_length_match\n    raise ValueError(\nValueError: Length of values (3885) does not match length of index (1115)\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 
'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution 
time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an 
hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 
'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Traceback (most recent call last):\n  File \"runfile.py\", line 25, in <module>\n    adata_test.obs[\"celltype_label\"] = y_pred\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/pandas/core/frame.py\", line 4091, in __setitem__\n    self._set_item(key, value)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/pandas/core/frame.py\", line 4300, in _set_item\n    value, refs = self._sanitize_column(value)\n  File 
\"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/pandas/core/frame.py\", line 5039, in _sanitize_column\n    com.require_length_match(value, self.index)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/pandas/core/common.py\", line 561, in require_length_match\n    raise ValueError(\nValueError: Length of values (3885) does not match length of index (1115)\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds 
(time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 
0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Traceback (most recent call last):\n  File \"runfile.py\", line 25, in <module>\n    adata_test.obs[\"celltype_label\"] = cross_val_predict(rf_model, X_test, y_train, cv=5)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py\", line 218, in wrapper\n    return func(*args, **kwargs)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py\", line 1171, in cross_val_predict\n    X, y = indexable(X, y)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/validation.py\", line 530, in indexable\n    check_consistent_length(*result)\n  File \"/home/aturcan/miniconda3/envs/aide3/lib/python3.10/site-packages/sklearn/utils/validation.py\", line 473, in check_consistent_length\n    raise ValueError(\nValueError: Found input variables with inconsistent numbers of samples: [1115, 3885]\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 
'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 
0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: 17 seconds seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 
0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute 
seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "Cross-validated accuracy: 0.697039897039897\n{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute 
seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour).", "{'accuracy': 0.6278026905829597, 'f1_macro': 0.45016369292499353, 'f1_micro': 0.6278026905829597, 'f1_weighted': 0.5881058824270999}\nExecution time: a minute seconds (time limit is an hour)."], "analysis": ["The script successfully executed and provided an accuracy of approximately 34.98% for the label projection task using a K-Nearest Neighbors classifier. The F1 scores varied across different averages, with macro at 19.88%, micro at 34.98%, and weighted at 30.29%. The results suggest that while the model can predict some labels correctly, there is substantial room for improvement in the classification performance.", "The Random Forest Classifier was successfully applied to the single cell RNA-seq label projection task. The model achieved an accuracy of 62.78%, with F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted). These results indicate a moderate performance, suggesting potential room for improvement in model selection or parameter tuning.", "The SVM model trained on single cell RNA-seq data achieved an accuracy of 83.14% and F1-scores of 74.84% (macro), 83.14% (micro), and 82.37% (weighted). These results indicate a strong performance in label projection, suggesting that the model effectively learned to classify cell types based on normalized expression profiles.", "The execution of the training script was interrupted due to a timeout error, specifically a `KeyboardInterrupt` caused by an excessively long computation time during the model fitting stage. 
This indicates that the model training was overly time-consuming or got stuck in an infinite loop. To fix this issue, consider simplifying the model, reducing the dataset size, or optimizing the model parameters to ensure faster convergence.", "The execution failed due to a timeout error caused by the GradientBoostingClassifier taking too long to fit the model. This issue might be due to the large dataset size or the complexity of the model. A potential fix would be to use a simpler model or reduce the complexity of the GradientBoostingClassifier by adjusting its parameters such as reducing the number of estimators or the depth of the trees.", "The RandomForestClassifier model achieved an accuracy of 62.78% and varying F1 scores across different averaging methods (macro: 45.02%, micro: 62.78%, weighted: 58.81%). These metrics indicate a moderate level of predictive performance, suggesting room for improvement in model training or parameter tuning.", "The output log indicates that the code executed successfully, achieving a cross-validated accuracy of approximately 69.70% on the training data and an accuracy of about 62.78% on the test data. The F1 scores in various averages (macro, micro, weighted) also provide insights into the model's performance across different aspects of the classification task.", "The output from the training script indicates that the model achieved an accuracy of 62.78% on the test dataset. The F1 scores in different averages (macro, micro, and weighted) were also reported, providing a comprehensive evaluation of the model's performance across different aspects of classification accuracy.", "The execution of the code resulted in a TimeoutError, indicating that the GradientBoostingClassifier model took too long to fit the data, exceeding the time limit of an hour. This could be due to the size of the dataset or the complexity of the model. 
A potential fix could involve using a simpler model, reducing the dataset size, or using dimensionality reduction techniques to make the model training more efficient.", "The execution failed due to a timeout error caused by the GradientBoostingClassifier's training process taking too long. This issue can be addressed by reducing the complexity of the model or by using a simpler model that requires less computational time. Alternatively, increasing the computational resources or using a more efficient algorithm like LightGBM might help resolve the timeout issue.", "The model achieved an accuracy of approximately 62.78%, with F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted). These results indicate that the model has a moderate predictive performance on the test set, suggesting that there is room for improvement in model training or perhaps in feature engineering.", "The script successfully executed and provided a cross-validated accuracy of approximately 69.7% on the training dataset. The final evaluation on the test dataset resulted in an accuracy of 62.8% and F1-scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted). These results suggest that the model has some predictive power, but there might be room for improvement in model performance or data preprocessing.", "The Random Forest model achieved an accuracy of 69.70% and F1 scores across macro, micro, and weighted averages as 45.61%, 69.70%, and 64.99% respectively. These results indicate that the model performs reasonably well in classifying cell types based on single cell RNA-seq data, though there is room for improvement especially in the macro F1 score which considers class imbalance.", "The script successfully executed, performing single cell RNA-seq label projection using a Random Forest classifier. It achieved a cross-validated accuracy of approximately 69.7% on the training data. 
The final evaluation on the test data yielded an accuracy of 62.78%, with F1 scores (macro, micro, weighted) reflecting varying levels of performance across different averaging methods.", "The model trained using a Random Forest classifier achieved an accuracy of 62.78% on the test dataset. The F1 scores in different averages (macro, micro, and weighted) were also reported, indicating the model's overall performance in handling the classification of cell types based on single-cell RNA-seq data.", "The script successfully trained a Random Forest model to predict cell types based on single-cell RNA-seq data, achieving an accuracy of 0.6278. The F1 scores were reported in macro, micro, and weighted formats, with the highest F1 score being the micro F1 at 0.6278, matching the accuracy.", "The code executed successfully and generated predictions for the single cell RNA-seq label projection task. The Random Forest Classifier model achieved an accuracy of 62.78% and F1 scores across different averages (macro, micro, weighted) reflecting moderate performance. The model's performance indicates room for improvement in future iterations or by tuning the model or preprocessing steps.", "The Random Forest model achieved a cross-validated accuracy of approximately 69.70% on the training data. When applied to the test data, the model achieved an accuracy of 62.78%, with F1 scores (macro, micro, weighted) of 45.02%, 62.78%, and 58.81% respectively. These results indicate that the model has moderate predictive power, with room for improvement in both accuracy and F1 scores across different averaging methods.", "The script successfully trained a Random Forest model to predict cell types from single-cell RNA-seq data. The cross-validated accuracy on the training set was approximately 69.70%. Upon evaluating the test set, the model achieved an accuracy of 62.78%, a macro F1 score of 45.02%, a micro F1 score of 62.78%, and a weighted F1 score of 58.81%. 
These results indicate moderate predictive performance.", "The Random Forest model achieved a cross-validated accuracy of approximately 69.7% on the training data and an accuracy of about 62.8% on the test data. The F1 scores in different averaging methods (macro, micro, weighted) were also reported, showing a reasonable performance but indicating potential for improvement in model generalization.", "The script successfully executed and provided results for the single cell RNA-seq label projection using a RandomForestClassifier. The achieved accuracy was 62.78%, with F1-scores in macro, micro, and weighted formats being 45.02%, 62.78%, and 58.81% respectively. These results indicate a decent classification performance, especially considering the complexity of single-cell RNA sequencing data.", "The implemented model using Random Forest achieved an accuracy of 62.78% with F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%). These metrics indicate a moderate performance in label projection on the test dataset. The model could potentially be improved by tuning hyperparameters, using feature engineering, or employing more complex models.", "The script successfully executed, performing single cell RNA-seq label projection using a RandomForestClassifier. The cross-validated accuracy on the training data is approximately 69.7%. When evaluated on the test data, the overall accuracy is about 62.8% with varying F1 scores across different averaging methods. This indicates a moderate performance of the model in generalizing to new data.", "The model achieved an accuracy of 0.6278 and F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881) on the test data. 
These results indicate that the model has a moderate predictive performance, with potential room for improvement, especially in balancing the prediction across various cell types as reflected by the macro F1 score.", "The script successfully projected cell type labels from a training dataset to a test dataset using a Random Forest classifier with cross-validation. The accuracy achieved is 62.78%, with F1 scores across different averages ranging from 45.02% to 58.81%. These results indicate moderate prediction performance.", "The Random Forest model achieved a cross-validated accuracy of approximately 69.7% on the training data. After fitting the model to the entire training set, the evaluation on the test set yielded an accuracy of about 62.8% with F1-scores (macro, micro, and weighted) of 45.02%, 62.78%, and 58.81% respectively. These results suggest moderate performance, indicating room for improvement in model selection or parameter tuning.", "The execution output indicates that the Random Forest model achieved a cross-validated accuracy of approximately 69.70% on the training data. Upon evaluating the model on the test data, it achieved an accuracy of 62.78% and F1 scores (macro, micro, and weighted) of 45.02%, 62.78%, and 58.81% respectively. These results suggest that the model has moderate predictive performance, which might be improved with parameter tuning or alternative modeling approaches.", "The code executed successfully, training a RandomForest model for cell type prediction based on single cell RNA-seq data. The model achieved an accuracy of 0.6278 and F1 scores in different averages ranged from 0.4502 to 0.5881, indicating a moderate performance.", "The script successfully trained a Random Forest classifier on single-cell RNA-seq data and evaluated it on a test set. The cross-validated accuracy on the training set was approximately 69.7%, while the final accuracy on the test set was 62.78%. 
The F1 scores (macro, micro, and weighted) were also reported, providing a comprehensive view of the model's performance across different aspects of classification accuracy.", "The code fails due to a mismatch in the number of samples between `X_test` and `y_train` when using `cross_val_predict` for the test set. This issue arises because `cross_val_predict` expects the same number of samples in `X` and `y`. A correct approach would be to fit the model on the training data and then use the `predict` method on the test data.", "The code successfully trained a Random Forest model to predict cell types based on single cell RNA-seq data. The model achieved an accuracy of 62.78%, with F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted). These results indicate a decent performance, though there may be room for improvement in the model's ability to generalize across different cell types.", "The Random Forest model trained on the normalized single-cell RNA-seq data achieved an accuracy of 62.78% and F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%). These results suggest that the model is moderately effective in classifying cell types, but there is room for improvement, possibly through model tuning or using more sophisticated algorithms.", "The execution of the Random Forest model for single cell RNA-seq label projection yielded an accuracy of 62.78% and F1 scores in various averages (macro, micro, weighted). The results indicate that while the model is moderately effective, there may be room for improvement in classification performance, perhaps by tuning model parameters or experimenting with different machine learning algorithms.", "The script successfully trained a Random Forest classifier and evaluated it on the test dataset using accuracy and F1 scores. 
The resulting accuracy of the model on the test set is approximately 62.78%, with F1 scores varying across different averaging methods, indicating moderate performance in label projection for single-cell RNA-seq data.", "The code execution failed due to a mismatch in the length of predictions and the index of the test dataset. The error arises when trying to assign the predicted labels to `adata_test.obs['celltype_label']`. This issue can be fixed by ensuring that the classifier is trained and predicted on correctly aligned data sizes, or by correctly indexing the predictions to match the test data.", "The script successfully trained a RandomForest model to predict cell types from single-cell RNA-seq data, achieving an accuracy of 62.78% and F1 scores in various averages (macro, micro, weighted). These results indicate moderate predictive performance, suggesting room for model improvement or parameter tuning.", "The code successfully trained a Random Forest classifier to predict cell types from single cell RNA-seq data and evaluated its performance. The accuracy achieved was 62.78%, with a macro F1 score of 45.02%, micro F1 score of 62.78% and weighted F1 score of 58.81%. These results suggest moderate effectiveness of the model, with potential room for improvement.", "The implemented Random Forest model for single cell RNA-seq label projection achieved an accuracy of 62.78% and F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%). These results indicate that the model has a decent performance, but there might be room for improvement, possibly by tuning hyperparameters or using a different modeling approach.", "The Random Forest model achieved an accuracy of 0.6278 and F1 scores of 0.4502 (macro), 0.6278 (micro), and 0.5881 (weighted) on the test dataset. 
These results indicate a moderate level of prediction performance, suggesting that the model could benefit from further optimization or different modeling approaches to improve classification accuracy and F1 scores.", "The Random Forest classifier achieved an accuracy of 62.78% and F1 scores of 0.450 (macro), 0.628 (micro), and 0.588 (weighted) on the test dataset. These results indicate that the model has reasonable predictive power but could potentially benefit from further optimization or the use of more complex models to improve classification performance.", "The script successfully trained a Random Forest model to predict cell types in single cell RNA-seq data, achieving an accuracy of 0.6278 and F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881). These results indicate a reasonable model performance, with potential room for improvement in class balance handling (as indicated by the macro F1 score).", "The script successfully executed and produced a set of evaluation metrics for the single cell RNA-seq label projection task using a RandomForestClassifier. The accuracy achieved was 0.6278, with F1 scores in macro, micro, and weighted averaging at 0.4502, 0.6278, and 0.5881 respectively. These results indicate a moderate level of performance in label prediction.", "The code successfully projected the cell types from the training dataset to the test dataset using a Random Forest Classifier. The accuracy achieved is 0.6278, with F1-scores for macro, micro, and weighted being 0.4502, 0.6278, and 0.5881 respectively. These metrics indicate a moderate level of performance, suggesting potential room for model tuning or exploring alternative classification methods to improve the predictions.", "The implemented Random Forest classifier for single cell RNA-seq label projection achieved an accuracy of 62.78%. 
The F1 scores across different averaging methods (macro, micro, weighted) varied, indicating room for improvement in model performance, particularly in handling class imbalance. The overall results suggest a functional but potentially improvable model.", "The execution of the single cell RNA-seq label projection using a Random Forest classifier was successful. The model achieved an accuracy of 0.6278 and F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881) on the test dataset. These results indicate a moderate level of performance, suggesting room for improvement in the model or feature engineering.", "The code successfully trained a Random Forest model to predict cell types based on single cell RNA-seq data and evaluated the predictions using accuracy and F1 scores. The accuracy achieved was 62.78%, with F1 scores varying across different averaging methods. The results suggest that the model performs moderately well, with room for improvement in classification accuracy and balance across classes.", "The script successfully executed the task of projecting cell type labels using a Random Forest model on single-cell RNA-seq data, achieving an accuracy of 62.78%. The F1 scores across different averages (macro, micro, weighted) were also computed, providing a comprehensive evaluation of the model's performance.", "The script successfully executed the task of projecting cell type labels using a RandomForestClassifier on single cell RNA-seq data. The evaluation metrics indicate an accuracy of 62.78%, with F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted). These results suggest moderate prediction performance, highlighting potential areas for model improvement or parameter tuning.", "The code successfully trained a Random Forest classifier on single-cell RNA-seq data, achieving an accuracy of 0.6278. 
The F1 scores for macro, micro, and weighted averages were 0.4502, 0.6278, and 0.5881 respectively, indicating moderate predictive performance.", "The implementation of the Random Forest classifier for single cell RNA-seq label projection achieved an accuracy of 62.78%. The F1 scores across different averages (macro, micro, and weighted) also indicate a reasonable performance, with the highest being the micro F1 score at 62.78%. This suggests that the model is moderately effective in predicting cell types based on normalized RNA-seq data.", "The code successfully executed and provided accuracy and F1 scores for the single cell RNA-seq label projection using a Random Forest model. The accuracy achieved is 62.78%, with F1 scores varying across different averages. This indicates a moderate performance of the model in classifying cell types.", "The script successfully executed and achieved an accuracy of 62.78% with F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%) on the test dataset. This indicates a moderate level of prediction performance, suggesting that the model is somewhat effective in classifying cell types based on single cell RNA-seq data, though there is room for improvement in classification accuracy and balance across classes.", "The implementation successfully trained a RandomForestClassifier to project cell type labels from single cell RNA-seq data, achieving an accuracy of 0.6278. The F1 scores for macro, micro, and weighted averages are 0.4502, 0.6278, and 0.5881 respectively, indicating a moderate performance.", "The execution of the Random Forest classification model for single cell RNA-seq label projection resulted in an accuracy of 0.6278, with F1 scores (macro, micro, weighted) of 0.4502, 0.6278, and 0.5881 respectively. 
These results indicate a moderate performance of the model in classifying cell types based on normalized RNA-seq data.", "The script successfully implemented a Random Forest classifier to predict cell types in single-cell RNA-seq data, achieving an accuracy of 0.6278. The F1 scores in different averages (macro, micro, weighted) provide additional insights into the model's performance across various cell types, indicating a balanced but moderate performance.", "The model achieved an accuracy of 62.78% and F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted) on the test data. The results suggest that the Random Forest classifier has a moderate performance, with room for improvement in class balance handling, as indicated by the lower macro F1 score.", "The script successfully executed and projected cell type labels from the training dataset to the test dataset using a Random Forest classifier. The evaluation metrics indicate an accuracy of 62.78%, with varying F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%). This reflects a decent classification performance, suggesting the model could potentially be improved with further tuning or alternative modeling approaches.", "The script successfully trained a Random Forest model on single-cell RNA-seq data and evaluated its performance on a test set. The achieved accuracy of the model is 62.78%, with F1 scores varying across different averaging methods. This indicates a moderate level of prediction performance, suggesting room for model improvement or parameter tuning.", "The execution of the Random Forest classifier for single cell RNA-seq label projection was successful. The model achieved an accuracy of 0.6278 and various F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881), indicating a moderate level of prediction performance.", "The script successfully trained a RandomForestClassifier on single-cell RNA-seq data and predicted cell types for the test dataset. 
The accuracy achieved is 62.78%, with F1 scores varying across different averaging methods: macro F1 is 45.02%, micro F1 is 62.78%, and weighted F1 is 58.81%. These results indicate moderate performance, suggesting room for improvement in model tuning or choice.", "The script successfully trained a Random Forest model on single-cell RNA-seq data, predicting cell types for a test dataset. The accuracy achieved is 62.78%, with F1 scores in macro, micro, and weighted categories being 45.02%, 62.78%, and 58.81% respectively. These results indicate a moderate performance, suggesting potential room for model tuning or exploring different algorithms for better accuracy.", "The script successfully executed and produced the following metrics: accuracy of 62.78%, F1-macro of 45.02%, F1-micro of 62.78%, and F1-weighted of 58.81%. These results indicate the model's performance in predicting cell types from single-cell RNA-seq data, with a reasonable accuracy considering the complexity of the task.", "The implemented Random Forest model achieved an accuracy of 0.6278 and F1 scores across different averages (macro, micro, weighted) ranging from 0.4502 to 0.5881. The model was successfully trained and evaluated on the single-cell RNA-seq data, projecting labels from the training dataset to the test dataset effectively.", "The execution of the Random Forest classifier for single cell RNA-seq label projection on the test dataset resulted in an accuracy of 62.78% and F1 scores (macro, micro, weighted) of 45.02%, 62.78%, and 58.81% respectively. These results indicate a moderate level of predictive performance, suggesting room for improvement possibly through model tuning or exploring alternative classification algorithms.", "The script successfully executed and computed the classification metrics for the single cell RNA-seq label projection using a RandomForestClassifier. 
The accuracy achieved is 0.6278, with F1-scores (macro, micro, weighted) of 0.4502, 0.6278, and 0.5881 respectively, indicating a moderate performance.", "The execution of the Random Forest classifier for single cell RNA-seq label projection was successful. The model achieved an accuracy of 62.78% on the test data. The F1-scores were reported as 45.02% (macro), 62.78% (micro), and 58.81% (weighted), indicating a moderate performance across different cell types.", "The script successfully executed and produced the metrics for the Random Forest classifier's performance on single cell RNA-seq label projection. The accuracy obtained is 0.6278, along with F1 scores for macro, micro, and weighted averaging at 0.4502, 0.6278, and 0.5881 respectively. These results indicate moderate predictive performance.", "The script successfully projected the cell type labels from the training dataset to the test dataset using a Random Forest classifier. The accuracy of the predictions on the test set is 62.78%, with an F1 score (macro) of 45.02%, F1 score (micro) of 62.78%, and F1 score (weighted) of 58.81%. These results indicate a moderate performance of the model, suggesting that further optimization or alternative models might be explored to improve the classification accuracy.", "The script successfully applied a Random Forest classifier to predict cell type labels on single-cell RNA-seq data, achieving an accuracy of 62.78%. The F1 scores reported in macro, micro, and weighted categories show varied performance, indicating some class imbalances or prediction challenges for certain cell types.", "The Random Forest model was used to project cell type labels from training to test data in single-cell RNA-seq analysis. The model achieved an accuracy of 0.6278 and F1 scores across different averages (macro: 0.4502, micro: 0.6278, weighted: 0.5881). 
These results suggest moderate performance, indicating room for improvement in model training or parameter tuning.", "The Random Forest model achieved an accuracy of 62.78% and F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted) in predicting cell types from single-cell RNA-seq data. These results indicate a moderate level of prediction performance, suggesting room for model optimization or exploration of alternative modeling approaches.", "The script successfully performed label projection for single cell RNA-seq data using a RandomForestClassifier. The model achieved an accuracy of approximately 62.78% and F1-scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted) on the test dataset. These results suggest moderate effectiveness in label prediction, with room for improvement in model performance or feature engineering.", "The execution of the Random Forest model for single-cell RNA-seq label projection yielded an accuracy of 62.78%, with F1 scores varying across different averages. The model appears to perform decently, but there's potential for improvement in class balance handling given the macro F1 score of 45.01%.", "The Random Forest classifier achieved an accuracy of 62.78% on the test dataset. The F1 scores were 45.02% (macro), 62.78% (micro), and 58.81% (weighted), indicating moderate performance across different averaging methods. This suggests that while the classifier is reasonably effective, there is room for improvement, possibly through model tuning or using more complex models.", "The script successfully trained a Random Forest model to predict cell types based on single-cell RNA-seq data, achieving an accuracy of 62.78% and F1 scores in various metrics. 
The model's best performance was in accuracy, while the F1 scores indicate room for improvement in prediction precision and recall across different cell types.", "The Random Forest classifier was used to project labels from a training dataset to a test dataset in a single-cell RNA-seq analysis. The model achieved an accuracy of 0.628 and F1 scores of 0.450 (macro), 0.628 (micro), and 0.588 (weighted). These results indicate moderate performance, suggesting room for improvement in model selection or parameter tuning.", "The implemented RandomForestClassifier achieved an accuracy of 62.78% and F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%) on the test dataset. These results indicate that the classifier was able to project cell types with moderate effectiveness, suggesting room for improvement in model performance or feature engineering.", "The script successfully performed single cell RNA-seq label projection using a RandomForestClassifier. The achieved accuracy is 0.6278, with F1 scores (macro, micro, weighted) of 0.4502, 0.6278, and 0.5881 respectively. These results indicate a moderate performance of the model in classifying cell types.", "The script successfully executed the task of label projection using a RandomForestClassifier on single cell RNA-seq data. The accuracy achieved was 0.6278, with F1 scores (macro, micro, weighted) being 0.4502, 0.6278, and 0.5881 respectively. These results indicate a moderate level of prediction accuracy, suggesting room for potential improvement in model performance or data preprocessing.", "The Random Forest model achieved an accuracy of 62.78% on the test dataset, with F1 scores for macro, micro, and weighted averages being 45.02%, 62.78%, and 58.81% respectively. 
This indicates a moderate performance in classifying cell types, suggesting potential areas for improvement in model training or parameter tuning.", "The script successfully trained a Random Forest model on single-cell RNA-seq data and predicted cell types on a test dataset. The model achieved an accuracy of 0.6278 and F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881). These results indicate moderate performance, suggesting room for improvement in model training or parameter tuning.", "There is a bug in the code where the predicted labels (y_pred) generated from cross-validation on the training set do not match the length of the test set index when attempting to assign to adata_test.obs['celltype_label']. To fix this, ensure that the model is trained on the entire training set and then used to predict labels for the test set, rather than using cross-validation predictions directly.", "The code successfully executed and produced a set of performance metrics for the cell type prediction model on test data. The model achieved an accuracy of 62.78%, with a macro F1-score of 45.02%, a micro F1-score of 62.78%, and a weighted F1-score of 58.81%. These results indicate moderate performance, suggesting room for improvement in the model or feature engineering.", "The script successfully executed the task of label projection using a Random Forest classifier, achieving an accuracy of 0.6278 on the test dataset. The F1 scores in various averages (macro, micro, weighted) were also reported, indicating a reasonable performance considering the complexity of single-cell RNA-seq data classification.", "The script successfully executed the task of projecting cell type labels from training data to test data using a RandomForestClassifier. The model achieved an accuracy of 62.78%, with a macro F1 score of 45.02%, a micro F1 score of 62.78%, and a weighted F1 score of 58.81% on the test data. 
These results indicate moderate performance, suggesting room for model improvement or parameter tuning.", "The Random Forest classifier achieved a reasonable performance with an accuracy of 62.78% and F1 scores across different averages showing varying levels of effectiveness. The macro F1 score was notably lower, indicating some imbalance in performance across different classes.", "The script successfully performed the task of predicting cell types using a RandomForestClassifier on single-cell RNA-seq data. The accuracy achieved was 62.78%, with F1 scores (macro, micro, and weighted) of 45.02%, 62.78%, and 58.81% respectively. These results indicate a decent classification performance, suggesting the model was able to capture relevant patterns in the data, but there might be room for improvement.", "The code successfully executed and produced valid accuracy and F1 scores for the label projection task using a Random Forest classifier on single cell RNA-seq data. The achieved accuracy is 0.6278, with F1 scores for macro, micro, and weighted averages being 0.4502, 0.6278, and 0.5881 respectively.", "The code successfully executed and provided metrics for the accuracy and F1 scores of the model's predictions on the test dataset. The accuracy achieved is 0.6278, with F1 scores varying across different averaging methods. The results indicate a moderate level of prediction performance, suggesting room for improvement in the model or data preprocessing steps.", "The implemented Random Forest classifier achieved an accuracy of 62.78% on the test dataset. The F1-scores reported are 0.450 for macro, 0.628 for micro, and 0.588 for weighted, indicating a moderate performance in classifying single cell RNA-seq data. 
The model might benefit from further parameter tuning or using more sophisticated models to improve these metrics.", "The Random Forest classifier applied to the single cell RNA-seq data achieved an accuracy of approximately 62.78%, with F1 scores (macro, micro, and weighted) of 0.450, 0.628, and 0.588 respectively. These results indicate the model's moderate effectiveness in classifying cell types based on normalized RNA-seq data, suggesting room for improvement in model performance or feature engineering.", "The script successfully trained a RandomForestClassifier on single-cell RNA-seq data and used it to predict cell types in the test dataset. The accuracy achieved is 62.78%, with F1 scores for macro, micro, and weighted averaging at 45.02%, 62.78%, and 58.81% respectively. These results indicate moderate predictive performance.", "The Random Forest model trained on normalized single-cell RNA-seq data achieved a classification accuracy of 62.78% on the test data. The F1-scores in macro, micro, and weighted formats were 45.02%, 62.78%, and 58.81% respectively, indicating moderate performance. These results suggest room for improvement in model training or parameter tuning.", "The script successfully executed and provided the evaluation metrics for the model's predictions on single cell RNA-seq label projection. The accuracy achieved is 0.6278, with F1 scores varying across different averaging methods: macro (0.4502), micro (0.6278), and weighted (0.5881). These results indicate a moderate performance of the Random Forest Classifier in predicting cell types.", "The output shows that the model achieved an accuracy of 62.78% and F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%) on the test dataset. 
These results indicate a moderate level of performance in predicting cell types based on single-cell RNA-seq data using a Random Forest classifier.", "The model achieved an accuracy of 62.78% and F1 scores across different averages (macro, micro, weighted) ranging from 45.02% to 62.78%. This indicates a moderate performance in predicting the cell types based on normalized single-cell RNA-seq data using a Random Forest Classifier.", "The implemented method for single cell RNA-seq label projection using a Random Forest classifier achieved an accuracy of 0.6278. The F1 scores across different averages (macro, micro, weighted) were also computed, indicating a reasonable performance of the model in classifying cell types based on normalized gene expression data.", "The script successfully executed and generated predictions for single cell RNA-seq label projection using a RandomForestClassifier. The model achieved an accuracy of 0.6278 and F1 scores of 0.4502 (macro), 0.6278 (micro), and 0.5881 (weighted) on the test data. These metrics indicate that while the model is reasonably accurate, there's room for improvement, particularly in handling class imbalance as reflected in the macro F1 score.", "The execution of the Random Forest model for single cell RNA-seq label projection resulted in an accuracy of 62.78%, with F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%). These results indicate a moderate performance in predicting cell types, suggesting room for improvement in model tuning or feature engineering.", "The script successfully executed and provided metrics for the model's performance on the test dataset. The Random Forest classifier achieved an accuracy of 0.6278 and F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881), indicating a moderate level of prediction accuracy. 
The model's performance suggests room for improvement, possibly by tuning model parameters or using more sophisticated modeling techniques.", "The evaluation of the Random Forest model on the single cell RNA-seq data resulted in an accuracy of 62.78% and F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted). These results indicate that the model has a moderate performance in predicting cell types based on RNA-seq data.", "The execution of the Random Forest classifier on the single cell RNA-seq data resulted in an accuracy of 62.78%, with F1 scores as follows: macro - 45.02%, micro - 62.78%, and weighted - 58.81%. These results indicate a moderate performance on the task of label projection, suggesting room for improvement in model tuning or feature engineering.", "The script successfully implemented a Random Forest classifier to project cell type labels from training to test data in single-cell RNA-seq analysis. The accuracy achieved is 62.78%, with macro, micro, and weighted F1 scores of 45.02%, 62.78%, and 58.81% respectively, indicating moderate performance.", "The script successfully trained a Random Forest model to project cell type labels from training to test data in single-cell RNA-seq analysis. The model achieved an accuracy of 62.78% and F1 scores (macro, micro, weighted) of 45.02%, 62.78%, and 58.81% respectively, indicating a moderate performance in label projection.", "The script successfully performed the task of predicting cell type labels using a Random Forest classifier on single cell RNA-seq data. The model achieved an accuracy of 0.6278 and various F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881), indicating a moderate performance. The predictions were saved and evaluated using the specified metrics.", "The code execution failed due to a mismatch in the number of samples between X_test and y_train when using the cross_val_predict function for the test data. 
The error occurred because y_train, which has a length of 3885, does not match the number of samples in X_test, which has 1115 samples. To fix this issue, the RandomForestClassifier should be fit using X_train and y_train, and then predictions should be made on X_test using the fitted model's predict method.", "The code successfully trained a Random Forest model on single cell RNA-seq data and predicted cell types for a test dataset. The reported accuracy was 0.6278, with F1 scores varying across different averages (macro, micro, weighted). The accuracy and F1 scores indicate moderate performance, suggesting potential room for model improvement or parameter tuning.", "The Random Forest model achieved an accuracy of 62.78% with F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted) on the test data. The model was trained using a 5-fold cross-validation on the training set and then used to predict cell types on the test set. The results indicate moderate performance, suggesting potential room for improvement in model tuning or exploring alternative modeling approaches.", "The script successfully executed, predicting cell types for single cell RNA-seq data using a Random Forest model. The model achieved an accuracy of 0.6278 and F1 scores as follows: macro 0.4502, micro 0.6278, weighted 0.5881. These results indicate moderate predictive performance, suggesting potential areas for model improvement or parameter tuning.", "The code executed successfully, producing an accuracy of 62.78% and F1 scores across different averages (macro, micro, and weighted). These metrics suggest that the model has a moderate performance in classifying cell types based on single cell RNA-seq data. The model could potentially be improved by tuning hyperparameters or using more sophisticated machine learning models.", "The script successfully executed the task of projecting cell type labels using a Random Forest classifier on single-cell RNA-seq data. 
The achieved accuracy is 62.78%, with F1 scores across different averages ranging from 45.02% to 58.81%. This indicates a moderate level of prediction performance, suggesting room for model improvement or parameter tuning.", "The script successfully executed and produced accuracy and F1 scores for the cell type prediction on single cell RNA-seq data. It achieved an accuracy of 62.78% and various F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%). These results indicate that the model has a reasonable predictive performance, though there might be room for improvement especially in balancing the class-specific performance as indicated by the macro F1 score.", "The code successfully projected cell type labels from the training dataset to the test dataset using a Random Forest model with cross-validation. The resulting accuracy of the model on the test data is 0.6278, with F1 scores for macro, micro, and weighted being 0.4502, 0.6278, and 0.5881 respectively. These results suggest moderate performance, indicating room for potential improvement in model accuracy and F1 scores.", "The Random Forest classifier achieved an accuracy of 62.78% and F1 scores across different averages (macro, micro, weighted) ranging from 45.02% to 62.78%. These results indicate a moderate performance in predicting the cell type labels on the test dataset, suggesting room for model improvement or parameter tuning.", "The Random Forest model trained on single cell RNA-seq data achieved an accuracy of 0.628 and F1 scores of 0.450 (macro), 0.628 (micro), and 0.588 (weighted). These results indicate moderate predictive performance, highlighting room for potential improvements in model tuning or feature engineering.", "The model achieved an accuracy of 62.78% and F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted) on the test dataset. 
These results indicate a moderate performance, suggesting room for model optimization and tuning to improve prediction accuracy and consistency across cell types.", "The Random Forest classifier successfully projected cell type labels from the training dataset to the test dataset. The accuracy achieved is 62.78%, with the F1-scores for macro, micro, and weighted averaging at 45.02%, 62.78%, and 58.81% respectively. These results indicate a moderate level of performance in label projection, suggesting room for model improvement or parameter tuning.", "The script successfully executed, implementing a Random Forest model for label projection in single-cell RNA-seq data. The model achieved an accuracy of 0.6278, with F1 scores across different averages ranging from 0.4502 to 0.6278. These results indicate a reasonable performance, though there might be room for improvement in model tuning or by exploring alternative classification algorithms.", "The implementation successfully projected cell type labels from the training dataset to the test dataset using a RandomForestClassifier. The evaluation metrics indicate an accuracy of 62.78%, with F1 scores varying across macro, micro, and weighted averages. This suggests moderate performance with room for improvement in classification accuracy.", "The implementation of the Random Forest classifier for single cell RNA-seq label projection achieved an accuracy of approximately 62.78%. The F1 scores across different averages (macro, micro, weighted) ranged from 45.02% to 62.78%, indicating a moderate performance. The cross-validation during training suggests a robust model fitting, and the classifier was able to generalize reasonably well to the test data.", "The script successfully executed and generated predictions for cell type labels based on single-cell RNA-seq data. It employed a Random Forest classifier with 5-fold cross-validation on the training dataset and made predictions on the test dataset. 
The evaluation metrics reported include an accuracy of 62.78%, a macro F1-score of 45.02%, a micro F1-score of 62.78%, and a weighted F1-score of 58.81%.", "The Random Forest model was successfully trained and tested on single cell RNA-seq data, achieving an accuracy of 62.78% and F1 scores across macro, micro, and weighted averages. The results indicate a decent classification performance, though there might be room for improvement in model tuning or trying different classification algorithms.", "The script successfully trained a RandomForest model on normalized single-cell RNA-seq data and predicted cell types on test data. It achieved an accuracy of 62.78%, with F1 scores (macro, micro, weighted) of 45.02%, 62.78%, and 58.81% respectively, indicating a moderate performance in label projection.", "The Random Forest classifier successfully projected cell type labels from training to testing data, achieving an accuracy of 62.78%. The F1 scores in different averages (macro, micro, weighted) provide additional insights into the model's performance across various cell types, indicating a reasonable but not excellent classification performance.", "The script successfully performed label projection on the test data using a Random Forest classifier trained on normalized single cell RNA-seq data. The evaluation metrics show an accuracy of 62.78%, with F1 scores varying across different averages, indicating the model's effectiveness and limitations in classification accuracy across cell types.", "The script successfully executed and provided the evaluation metrics for the single cell RNA-seq label projection using a Random Forest classifier. The accuracy achieved is 0.6278, with F1 scores across different averages varying between 0.4502 and 0.6278. 
These results indicate a moderate level of prediction performance, suggesting potential room for model improvement or parameter tuning.", "The script successfully executed and provided metrics for the label projection task using a Random Forest classifier on single cell RNA-seq data. The accuracy achieved is 62.78%, with an F1 score (macro) of 45.02%, F1 score (micro) of 62.78%, and F1 score (weighted) of 58.81%. These results indicate a decent classification performance, though there might be room for improvement, possibly by tuning the model or using different classification algorithms.", "The script has successfully projected the cell type labels from the training dataset to the test dataset using a Random Forest Classifier. The model achieved an accuracy of 62.78% and F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%). These metrics suggest that the model performs moderately well, but there might be room for improvement in label projection accuracy.", "The RandomForestClassifier model achieved an accuracy of 62.78% on the test dataset. The F1 scores across different averages (macro, micro, weighted) were also reported, indicating moderate performance. The model was trained using 5-fold cross-validation, and predictions were saved for submission.", "The script has successfully executed and predicted cell types for the test dataset using a RandomForestClassifier trained on normalized single-cell RNA-seq data. The achieved accuracy is 0.6278, and F1 scores are reported in macro, micro, and weighted variants, indicating a decent but potentially improvable model performance.", "The model achieved an accuracy of 62.78% and F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted) on the test set. 
These results indicate that the model can reasonably predict cell types from single-cell RNA-seq data, though there's room for improvement in terms of both accuracy and F1 scores.", "The model achieved an accuracy of 62.78% and F1 scores as follows: macro-0.450, micro-0.628, and weighted-0.588. These results indicate a moderate level of performance, suggesting that while the model can predict some cell types accurately, there's room for improvement, especially in balancing class-wise performance as indicated by the macro F1 score.", "The script successfully executed the task of projecting cell type labels from training data to test data using a Random Forest classifier. The evaluation metrics indicate an accuracy of 62.78% and F1 scores (macro, micro, weighted) of 45.02%, 62.78%, and 58.81% respectively. These results suggest moderate predictive performance, with room for improvement in classification accuracy and balance across classes.", "The script successfully executed the task of cell type prediction using a Random Forest classifier on normalized single cell RNA-seq data. The accuracy achieved was approximately 62.78%, with F1 scores in macro, micro, and weighted categories being 45.02%, 62.78%, and 58.81% respectively. This indicates a reasonable performance but suggests there might be room for improvement in model training or parameter tuning.", "The code executed successfully and produced an accuracy of 0.6278 and F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881) on the test set. The accuracy and F1 scores suggest moderate performance, indicating room for model improvement or parameter tuning.", "The script successfully trained a Random Forest model to predict cell types from single-cell RNA-seq data and evaluated the predictions. The achieved accuracy was 62.78%, with macro, micro, and weighted F1 scores of 45.02%, 62.78%, and 58.81% respectively. 
These results indicate a reasonable performance, though there might be room for improvement in model tuning or feature engineering.", "The script successfully executed and produced an accuracy of 0.6278 and F1 scores (macro: 0.4502, micro: 0.6278, weighted: 0.5881) on the test dataset. These results indicate a decent model performance, especially considering the complexity of single-cell RNA-seq data classification.", "The Random Forest model achieved an accuracy of 62.78% on the test dataset, which is a decent performance given the complexity of single-cell RNA-seq data. The F1 scores in various averages (macro, micro, weighted) also provide insights into the model's ability to handle imbalanced classes, with the weighted F1 score being slightly lower than the micro F1 score, indicating some class imbalances were managed but could be improved.", "The code successfully predicted the cell type labels for the test dataset using a RandomForestClassifier trained on normalized single-cell RNA-seq data. The reported metrics indicate an accuracy of 62.78%, with a macro F1-score of 45.02%, micro F1-score of 62.78%, and weighted F1-score of 58.81%. These results suggest moderate performance, which could potentially be improved with further model tuning or different classification algorithms.", "The script successfully trained a Random Forest model on single cell RNA-seq data and predicted cell types for the test set. The model achieved an accuracy of 62.78% and F1 scores across different averages (macro, micro, weighted) ranging from 45.02% to 62.78%. These results indicate a moderate predictive performance.", "The code successfully executed and provided an accuracy of 62.78% along with F1 scores in various averages (macro: 45.02%, micro: 62.78%, weighted: 58.81%). This indicates a moderate performance by the RandomForestClassifier in projecting cell type labels on single cell RNA-seq data. 
The results suggest there is room for improvement in the model's prediction accuracy and F1 scores.", "The script successfully trained a Random Forest classifier on single-cell RNA-seq data and evaluated its performance on a test set. The cross-validated accuracy on the training set was approximately 69.7%, and the final evaluation on the test set yielded an accuracy of about 62.8%. The F1 scores in different averages (macro, micro, weighted) were also computed, showing varied performance across different averaging methods.", "The implementation of the single cell RNA-seq label projection using a RandomForestClassifier has achieved moderate performance. The accuracy is 0.6278, and the F1 scores vary, with a macro F1 of 0.4502, micro F1 of 0.6278, and weighted F1 of 0.5881. These results suggest that the model can correctly classify cell types to a certain extent, but there's room for improvement in handling class imbalances or enhancing the model's precision and recall.", "The Random Forest model achieved an accuracy of 62.78% and varying F1 scores (macro: 45.02%, micro: 62.78%, weighted: 58.81%) on the test set. This indicates a moderate performance in predicting cell types from single-cell RNA-seq data, suggesting potential room for model improvement or parameter tuning.", "The script successfully executed a Random Forest model for single cell RNA-seq label projection, achieving an accuracy of 0.6278 and F1 scores in various averages. The model appears to perform moderately well given the complexity of the task and data variability. The results indicate that the model could potentially be improved by tuning parameters or experimenting with different machine learning approaches.", "The code successfully executed and produced an accuracy of 0.6278 and F1 scores in different averages (macro: 0.4502, micro: 0.6278, weighted: 0.5881). 
These results indicate a moderate level of prediction accuracy, suggesting that the model has room for improvement in terms of classification performance across different cell types.", "The execution of the Random Forest model for single cell RNA-seq label projection resulted in an accuracy of 62.78% and F1-scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted). These results indicate a moderate performance of the model in classifying cell types, with room for improvement especially in balancing performance across classes (as indicated by the macro F1-score).", "The Random Forest model used for label projection in single-cell RNA-seq data achieved an accuracy of 62.78% and F1 scores of 45.02% (macro), 62.78% (micro), and 58.81% (weighted). These results indicate a reasonable performance, though there might be room for improvement in the model's ability to generalize across different cell types.", "The Random Forest classifier successfully projected cell type labels from the training set to the test set. The reported accuracy is approximately 62.78%, with F1 scores varying across different averages: macro at 45.02%, micro at 62.78%, and weighted at 58.81%. These results indicate that the classifier is reasonably effective, though there may be room for improvement especially in terms of balancing class predictions (as indicated by the macro F1 score).", "The Random Forest model for single cell RNA-seq label projection achieved an accuracy of 62.78% and F1 scores (macro, micro, weighted) of 45.02%, 62.78%, and 58.81% respectively. 
These results indicate a moderate performance in predicting cell types, suggesting potential room for model improvement or parameter tuning."], "exp_name": "2-warm-woodlouse-of-election", "metrics": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

// ---- mutable sketch state ----

// Neither of these is read or written in the visible part of this script.
// NOTE(review): presumably left over from click/scroll handling — confirm before removing.
let lastClick = 0;
let lastScrollPos = 0;

// millis() reading taken by draw() on the first frame; undefined until then.
let firstFrameTime;

// Registries of everything to render: the Node constructor appends to `nodes`,
// Node.child() appends to `edges`.
let nodes = [];
let edges = [];

// p5 entry point: create the canvas at the size reported by updateTargetDims().
// `setup` and `canvas` are deliberately assigned without a declaration so that
// they end up as globals, exactly as before.
setup = () => {
  const [targetWidth, targetHeight] = updateTargetDims();
  canvas = createCanvas(targetWidth, targetHeight);
};

/**
 * One node of the tree, drawn as a rounded square containing "{ }".
 *
 * Constructing a Node registers it in the global `nodes` array. A node stays
 * invisible until `startAnimation()` has been called and `globalTime` has
 * reached `animationStart`; it then plays a short reveal animation and, near
 * the end of it, kicks off the animations of its outgoing edges.
 */
class Node {
  // Centre of the square, in the coordinate space draw() sets up with
  // translate()/scale().
  x;
  y;
  // Final edge length of the square (derived from relSize in the constructor).
  size;
  // Edge attachment points: top-centre (xT, yT) and bottom-centre (xB, yB).
  xT;
  yT;
  xB;
  yB;
  // Index into treeStructData.code / treeStructData.plan for this node.
  treeInd;
  // Grey level handed to fill(); derived from relSize in the constructor.
  color;
  // Relative size in the constructor's interpolation (0 -> min, 1 -> max).
  relSize;
  // globalTime at which the reveal starts; MAX_VALUE means "not scheduled yet".
  animationStart = Number.MAX_VALUE;
  // Eased reveal progress in [0, 1].
  animationProgress = 0;
  // True once the reveal animation has finished.
  isStatic = false;
  hasChildren = false;
  // Cleared by child() on the node that becomes a child.
  isRootNode = true;
  // When true, an accent "*" is drawn at the node's top-right.
  isStarred = false;
  selected = false;
  // Edge length used for the current frame (and for next frame's hit test).
  renderSize = 10;
  // Outgoing edges (this node is their top end).
  edges = [];
  // Computed in the constructor; not read anywhere in the visible code
  // (render() uses the global `bgCol` constant instead).
  bgCol;
  // Number of outgoing edges whose animation has already been scheduled by
  // render(); lets render() skip the sort/schedule pass when nothing changed.
  scheduledEdgeCount = 0;

  /**
   * @param {number} x       centre x
   * @param {number} y       centre y
   * @param {number} relSize interpolation factor for size and grey level
   * @param {number} treeInd index of this node in treeStructData
   */
  constructor(x, y, relSize, treeInd) {
    const minSize = 35;
    const maxSize = 60;

    // Note the inversion: relSize 0 maps to grey 125, relSize 1 to grey 10.
    const maxColor = 10;
    const minColor = 125;

    this.relSize = relSize;
    this.treeInd = treeInd;
    this.size = minSize + (maxSize - minSize) * relSize;
    this.color = minColor + (maxColor - minColor) * relSize;
    this.bgCol = Math.round(Math.max(this.color / 2, 0));

    this.x = x;
    this.y = y;
    this.xT = x;
    this.yT = y - this.size / 2;
    this.xB = x;
    this.yB = y + this.size / 2;

    nodes.push(this);
  }

  /**
   * Schedule the reveal animation `offset` time units after the current
   * globalTime. Has no effect if the animation was already scheduled.
   */
  startAnimation = (offset = 0) => {
    if (this.animationStart == Number.MAX_VALUE)
      this.animationStart = globalTime + offset;
  };

  /**
   * Attach `node` as a child: creates the connecting Edge, records it on this
   * node and in the global `edges` array, and returns `node` for chaining.
   */
  child = (node) => {
    let edge = new Edge(this, node);
    this.edges.push(edge);
    edges.push(edge);
    this.hasChildren = true;
    node.isRootNode = false;
    return node;
  };

  /**
   * Draw the node for the current frame, handle hover/click selection, and
   * schedule the outgoing edges once the reveal is almost complete.
   */
  render = () => {
    // Not scheduled yet, or scheduled for a later time: draw nothing.
    if (globalTime - this.animationStart < 0) return;

    // Undo draw()'s translate(width / 2, height / 2) + scale(scaleFactor) so the
    // mouse position can be compared against node coordinates.
    const mouseXlocalCoords = (mouseX - width / 2) / scaleFactor;
    const mouseYlocalCoords = (mouseY - height / 2) / scaleFactor;
    const isMouseOver =
      dist(mouseXlocalCoords, mouseYlocalCoords, this.x, this.y) <
      this.renderSize / 1.5;
    if (isMouseOver) cursor(HAND);
    if (isMouseOver && mouseIsPressed) {
      nodes.forEach((n) => (n.selected = false));
      this.selected = true;
      setCodeAndPlan(
        treeStructData.code[this.treeInd],
        treeStructData.plan[this.treeInd],
      );
      // From now on draw() stops auto-selecting a node.
      manualSelection = true;
    }

    this.renderSize = this.size;
    if (!this.isStatic) {
      this.animationProgress = animEase(
        (globalTime - this.animationStart) / 1000,
      );
      if (this.animationProgress >= 1) {
        this.isStatic = true;
      } else {
        // Quadratic that is 0 at p = 0 and 1 at p = 1 but exceeds 1 in between,
        // so the square grows from 80% of its size, overshoots, then settles.
        this.renderSize =
          this.size *
          (0.8 +
            0.2 *
              (-3.33 * this.animationProgress ** 2 +
                4.33 * this.animationProgress));
      }
    }

    fill(this.color);
    if (this.selected) {
      fill(accentCol);
    }

    noStroke();
    square(
      this.x - this.renderSize / 2,
      this.y - this.renderSize / 2,
      this.renderSize,
      10,
    );

    noStroke();
    textAlign(CENTER, CENTER);
    textSize(this.renderSize / 2);
    fill(255);
    // fill(lerpColor(color(accentCol), color(255), this.animationProgress))
    text("{ }", this.x, this.y - 1);
    // DEBUG PRINT:
    // text(round(this.relSize, 2), this.x, this.y - 1)
    // text(this.treeInd, this.x, this.y + 15)

    // The star fades in during the last 15% of the reveal.
    const dotAnimThreshold = 0.85;
    if (this.isStarred && this.animationProgress >= dotAnimThreshold) {
      let dotAnimProgress =
        (this.animationProgress - dotAnimThreshold) / (1 - dotAnimThreshold);
      textSize(
        ((-3.33 * dotAnimProgress ** 2 + 4.33 * dotAnimProgress) *
          this.renderSize) /
          2,
      );
      if (this.selected) {
        fill(0);
        stroke(0);
      } else {
        fill(accentCol);
        stroke(accentCol);
      }
      strokeWeight((-(dotAnimProgress ** 2) + dotAnimProgress) * 2);
      text("*", this.x + 20, this.y - 11);
      noStroke();
    }

    if (!this.isStatic) {
      // Background-coloured mask over the lower (1 - progress) part of the
      // node, so the square appears to be revealed from top to bottom.
      fill(bgCol);
      const progressAnimBaseSize = this.renderSize + 5;
      rect(
        this.x - progressAnimBaseSize / 2,
        this.y -
          progressAnimBaseSize / 2 +
          progressAnimBaseSize * this.animationProgress,
        progressAnimBaseSize,
        progressAnimBaseSize * (1 - this.animationProgress),
      );
    }

    // Near the end of the reveal, start the outgoing edges in ascending order
    // of their colour value, staggered quadratically over up to 1000 time
    // units. Edge.startAnimation() ignores repeat calls, so this pass only has
    // to run again when the number of edges changed since the last pass,
    // instead of sorting on every single frame.
    if (
      this.animationProgress >= 0.9 &&
      this.scheduledEdgeCount !== this.edges.length
    ) {
      this.edges
        .sort((a, b) => a.color() - b.color())
        .forEach((e, i) => {
          e.startAnimation((i / this.edges.length) ** 2 * 1000);
        });
      this.scheduledEdgeCount = this.edges.length;
    }
  };
}

/**
 * Animated bezier connector running from the bottom of `nodeT` (the parent)
 * to the top of `nodeB` (the child). The curve's end point travels from the
 * parent towards the child while the animation plays; shortly before it
 * arrives, the child's own reveal animation is triggered.
 */
class Edge {
  nodeT;
  nodeB;
  // Current end point of the partially drawn curve.
  animX = 0;
  animY = 0;
  // globalTime at which drawing starts; MAX_VALUE means "not scheduled yet".
  animationStart = Number.MAX_VALUE;
  animationProgress = 0;
  isStatic = false;
  weight = 0;

  constructor(nodeT, nodeB) {
    this.nodeB = nodeB;
    this.nodeT = nodeT;
    this.weight = nodeB.relSize * 1 + 2;
  }

  // Colour value of the child node; Node uses it to order its edges.
  color = () => {
    return this.nodeB.color;
  };

  // Schedule the edge `offset` time units from now; later calls are ignored.
  startAnimation = (offset = 0) => {
    if (this.animationStart != Number.MAX_VALUE) return;
    this.animationStart = globalTime + offset;
  };

  // Advance the animation and draw the curve for the current frame.
  render = () => {
    if (globalTime < this.animationStart) return;

    const { nodeT, nodeB } = this;
    // Both bezier control points sit at the vertical midpoint of the two ends.
    const midY = (nodeT.yB + nodeB.yT) / 2;

    if (!this.isStatic) {
      const elapsed = (globalTime - this.animationStart) / 1000;
      const progress = animEase(elapsed);
      this.animationProgress = progress;

      if (progress >= 1) {
        // Animation finished: snap the end point onto the child.
        this.isStatic = true;
        this.animX = nodeB.xT;
        this.animY = nodeB.yT;
      } else {
        this.animX = bezierPoint(
          nodeT.xB,
          nodeT.xB,
          nodeB.xT,
          nodeB.xT,
          progress,
        );
        this.animY = bezierPoint(nodeT.yB, midY, midY, nodeB.yT, progress);
      }
    }

    // Wake the child up just before the curve reaches it.
    if (this.animationProgress >= 0.97) {
      nodeB.startAnimation();
    }

    strokeWeight(this.weight);
    noFill();
    const blendAmount = nodeB.relSize * 1 + 0.7;
    const strokeColor = lerpColor(color(bgCol), color(accentCol), blendAmount);
    stroke(strokeColor);
    bezier(
      nodeT.xB,
      nodeT.yB,
      nodeT.xB,
      midY,
      this.animX,
      midY,
      this.animX,
      this.animY,
    );
  };
}

/**
 * p5 per-frame callback.
 *
 * 1. Advances `globalTime`, the clock all Node/Edge animations are driven by.
 * 2. On the first call (while `nodes` is empty) builds the Node/Edge objects
 *    from `treeStructData` and starts the animation of every root node.
 * 3. Unless the user has clicked a node, keeps the selection on the revealed
 *    node with the largest `relSize`.
 * 4. Clears the canvas and renders all edges, then all nodes.
 */
draw = () => {
  cursor(ARROW);
  frameRate(120);
  if (!firstFrameTime && frameCount <= 1) {
    firstFrameTime = millis();
  }
  // ---- update global animation state ----
  // Cosine ease that goes from 1 to 0 over the first 8000 ms after the first
  // frame, so the clock below runs at 60% speed initially and at full speed
  // from then on.
  const initialSpeedScalingEaseIO =
    (cos(min((millis() - firstFrameTime) / 8000, 1.0) * PI) + 1) / 2;
  const initAnimationSpeedFactor = 1.0 - 0.4 * initialSpeedScalingEaseIO;
  // update global scaling-aware clock
  globalTime += globalAnimSpeed * initAnimationSpeedFactor * deltaTime;

  if (nodes.length == 0) {
    // Layout coordinates are used as fractions of a box 1.3x the canvas size,
    // centred on the origin (draw() translates the origin to the canvas centre).
    const spacingHeight = height * 1.3;
    const spacingWidth = width * 1.3;
    treeStructData.layout.forEach((lay, index) => {
      new Node(
        spacingWidth * lay[0] - spacingWidth / 2,
        20 + spacingHeight * lay[1] - spacingHeight / 2,
        1 - treeStructData.metrics[index],
        index,
      );
    });
    treeStructData.edges.forEach((ind) => {
      nodes[ind[0]].child(nodes[ind[1]]);
    });
    nodes.forEach((n) => {
      if (n.isRootNode) n.startAnimation();
    });
    // An empty layout produces no nodes; skip the initial selection in that
    // case instead of throwing on nodes[0].
    if (nodes.length > 0) {
      nodes[0].selected = true;
      setCodeAndPlan(treeStructData.code[0], treeStructData.plan[0]);
    }
  }

  // Nodes that are fully revealed or at least 70% through their reveal.
  const staticNodes = nodes.filter(
    (n) => n.isStatic || n.animationProgress >= 0.7,
  );
  if (staticNodes.length > 0) {
    // On ties the later node wins (strict `>` keeps `prev` only if larger).
    const largestNode = staticNodes.reduce((prev, current) =>
      prev.relSize > current.relSize ? prev : current,
    );
    if (!manualSelection) {
      if (!largestNode.selected) {
        setCodeAndPlan(
          treeStructData.code[largestNode.treeInd],
          treeStructData.plan[largestNode.treeInd],
        );
      }
      staticNodes.forEach((node) => {
        node.selected = node === largestNode;
      });
    }
  }
  background(bgCol);
  // global animation transforms
  translate(width / 2, height / 2);
  scale(scaleFactor);

  // ---- fg render ----
  edges.forEach((e) => e.render());
  nodes.forEach((n) => n.render());
};

    </script>
    <title>AIDE Run Visualization</title>
    <style>
      /* Global reset: every element (body is also listed explicitly) gets zero
         margin and padding, border-box sizing and `overflow: scroll`. */
      body,
      * {
        margin: 0;
        padding: 0;
        box-sizing: border-box;
        overflow: scroll;
      }
      /* Page background; same colour as the `bgCol` constant used by the sketch. */
      body {
        background-color: #f2f0e7;
      }
      /* Left column: the sketch canvas.
         NOTE(review): the script sizes the canvas to windowWidth / 2 in
         updateTargetDims(), not 40vw — confirm which size is intended. */
      canvas {
        float: left;
        height: 100vh;
        width: 40vw;
      }
      /* Right column: the <pre> that holds the plan and the code. */
      #text-container {
        float: right;
        height: 100vh;
        width: 50vw;
        background-color: #282c34;
      }
      /* Plan text shown above the code: light text on the same dark background. */
      #plan {
        /* border-left: 2px solid #282c34; */
        background-color: #282c34;
        color: #f2f0e7;
        min-height: 5rem;
        padding: 1em 0 1em 1em;
      }
    </style>
  </head>
  <body>
    <!--
      Text panel for the selected node. setCodeAndPlan() in the script fills
      #plan with the highlighted plan text and #code with the highlighted
      Python code. The children are kept on one line because whitespace
      inside <pre> is rendered as-is.
    -->
    <pre
      id="text-container"
    ><div id="plan"></div><hr><code id="code" class="language-python"></code></pre>
  </body>
</html>
