{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPOShWpIiE/HZb9FIbGEhVZ"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"lv57SODdwyn0","executionInfo":{"status":"ok","timestamp":1727814915527,"user_tz":240,"elapsed":5797,"user":{"displayName":"胡平波","userId":"14290925349410595096"}}},"outputs":[],"source":["import numpy as np\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n","from sklearn.linear_model import LogisticRegression\n","\n","def logistic(z):\n","    return 1 / (1 + np.exp(-z))\n","\n","def GenerateData(n, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE):\n","    np.random.seed(SEED)\n","    noisyInput = np.random.normal(loc=paraMean, scale=paraSE, size=n)\n","    noise = np.random.normal(loc=noiseMean, scale=noiseSE, size=n)\n","    inputData = noisyInput - noise\n","\n","    trueLabelProbability = logistic(paraInInputModel * inputData+1)\n","    trueLabel = np.random.binomial(1, trueLabelProbability.flatten())\n","\n","    return inputData, noisyInput, trueLabel\n","\n","\n","def GenerateAugmentedData(n, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE):\n","    np.random.seed(SEED)\n","    noisyInput = np.random.normal(loc=paraMean, scale=paraSE, size=n)\n","    noise = np.random.normal(loc=noiseMean, scale=noiseSE, size=n)\n","    inputData = noisyInput - noise\n","\n","    trueLabelProbability = logistic(paraInInputModel * inputData+1)\n","    trueLabel = np.random.binomial(1, trueLabelProbability.flatten())\n","\n","    return inputData, trueLabel\n","\n","def LogisticTrueFunction(inputTrain, inputTest, labelTrain, labelTest):\n","    clf = LogisticRegression(solver='lbfgs', max_iter=400)\n","    clf.fit(inputTrain.reshape(-1, 1), labelTrain)  # Reshape to 2D\n","    y_pred = clf.predict(inputTest.reshape(-1, 1))  # Reshape to 2D\n","    true_test_result = [accuracy_score(labelTest, y_pred), precision_score(labelTest, y_pred),\n","                        recall_score(labelTest, y_pred), f1_score(labelTest, y_pred)]\n","    return true_test_result\n","\n","def LogisticNaiveFunction(noisyInputTrain, inputTest, labelTrain, labelTest):\n","    clf = LogisticRegression(solver='lbfgs', max_iter=400)\n","    clf.fit(noisyInputTrain.reshape(-1, 1), labelTrain)  # Reshape to 2D\n","    y_pred = clf.predict(inputTest.reshape(-1, 1))  # Reshape to 2D\n","    naive_test_result = [accuracy_score(labelTest, y_pred), precision_score(labelTest, y_pred),\n","                         recall_score(labelTest, y_pred), f1_score(labelTest, y_pred)]\n","    return naive_test_result\n","\n","def LogisticAugmentedCorrectionFunction(noisyInputTrain, inputTest, labelTrain, labelTest, augmentedInput, augmentedLabel):\n","    inputData = np.concatenate((noisyInputTrain, augmentedInput))\n","    label = np.concatenate((labelTrain, augmentedLabel))\n","    clf = LogisticRegression(solver='lbfgs', max_iter=400)\n","\n","    clf.fit(inputData.reshape(-1, 1), label)  # Reshape to 2D\n","    y_pred = clf.predict(inputTest.reshape(-1, 1))  # Reshape to 2D\n","    correction_test_result = [accuracy_score(labelTest, y_pred), precision_score(labelTest, y_pred),\n","                        recall_score(labelTest, y_pred), f1_score(labelTest, y_pred)]\n","    return correction_test_result\n","\n","\n","\n"]},{"cell_type":"code","source":["true_test_result = []\n","naive_test_result = []\n","augmented_test_result = []\n","\n","Cases = [[1, 1], [1.2, 1], [1.4, 1], [1, 0.2], [1, 0.5], [1, 0.8]]\n","\n","for SEED in range(100):\n","    numAugmentedData = 10000\n","    numInEC = 1000\n","    ##noiseMean = 1-1.5, noiseSE=1\n","    ##noiseMean = 1, noiseSE = 0.2-1\n","    numData, paraInInputModel, paraMean, paraSE = 1000, 10, 1, 1\n","    augmented_test_result.append([])\n","    naive_test_result.append([])\n","    true_test_result.append([])\n","    for [noiseMean, noiseSE] in Cases:\n","        inputData, noisyInput, label = GenerateData(numData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","        augmentedInput, augmentedLabel = GenerateAugmentedData(numAugmentedData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","\n","        inputTest, noisyInputTest, labelTest = GenerateData(200, SEED+1000, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","\n","        #inputTrain, inputTest, noisyInputTrain, noisyInputTest, labelTrain, labelTest = train_test_split(\n","        #    inputData, noisyInput, label, test_size=0.2, random_state=123)\n","\n","        true_test_result[SEED].append(LogisticTrueFunction(inputData, inputTest, label, labelTest))\n","        naive_test_result[SEED].append(LogisticNaiveFunction(noisyInput, inputTest, label, labelTest))\n","        augmented_test_result[SEED].append(LogisticAugmentedCorrectionFunction(noisyInput, inputTest, label, labelTest, augmentedInput, augmentedLabel))\n","\n","\n","\n"],"metadata":{"id":"QVuxz8m4yIOJ","executionInfo":{"status":"ok","timestamp":1727814976286,"user_tz":240,"elapsed":60761,"user":{"displayName":"胡平波","userId":"14290925349410595096"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["a1 = np.array(true_test_result)\n","b1 = np.array(naive_test_result)\n","c1 = np.array(augmented_test_result)\n","\n","print(a1.mean(axis=0))\n","print(b1.mean(axis=0))\n","print(c1.mean(axis=0))\n","\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Lwab1VnmyM68","executionInfo":{"status":"ok","timestamp":1727814976286,"user_tz":240,"elapsed":24,"user":{"displayName":"胡平波","userId":"14290925349410595096"}},"outputId":"bd13c010-ae6b-4c77-8fa0-fb3fb48b8d48"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["[[0.96155    0.96183886 0.96439544 0.96293071]\n"," [0.9622     0.96298615 0.95604877 0.95927561]\n"," [0.9632     0.9560598  0.95421883 0.95482961]\n"," [0.94685    0.94761993 0.95237797 0.9497713 ]\n"," [0.9528     0.95422725 0.95639699 0.95511463]\n"," [0.95595    0.95888011 0.95735749 0.95785048]]\n","[[0.737      1.         0.49466531 0.66019477]\n"," [0.7083     1.         0.37510951 0.54380162]\n"," [0.7005     1.         0.26951094 0.42188717]\n"," [0.6543     1.         0.34823179 0.51477033]\n"," [0.678      1.         0.39048647 0.56002226]\n"," [0.71225    1.         0.45122982 0.6200154 ]]\n","[[0.9544     0.9837228  0.92786425 0.95479205]\n"," [0.95       0.98653901 0.90542936 0.94398128]\n"," [0.9449     0.98966963 0.87493616 0.92831351]\n"," [0.9431     0.97068456 0.92036174 0.94462019]\n"," [0.9471     0.97519406 0.92322466 0.94825216]\n"," [0.9504     0.97946172 0.92504504 0.95119549]]\n"]}]},{"cell_type":"code","source":["true_test_result = []\n","naive_test_result = []\n","augmented_test_result = []\n","numAugmentedDataList = [5000, 10000, 20000, 30000, 40000]\n","\n","for SEED in range(100):\n","  augmented_test_result.append([])\n","  idx = 0\n","  for numAugmentedData in numAugmentedDataList:\n","    augmented_test_result[SEED].append([])\n","    numInEC = 1000\n","    ##noiseMean = 1-1.5, noiseSE=1\n","    ##noiseMean = 1, noiseSE = 0.2-1\n","    numData, paraInInputModel, paraMean, paraSE = 1000, 10, 1, 1\n","    for [noiseMean, noiseSE] in Cases:\n","        inputData, noisyInput, label = GenerateData(numData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","        augmentedInput, augmentedLabel = GenerateAugmentedData(numAugmentedData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","        inputTest, noisyInputTest, labelTest = GenerateData(200, SEED+1000, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","\n","\n","        #inputTrain, inputTest, noisyInputTrain, noisyInputTest, labelTrain, labelTest = train_test_split(\n","        #    inputData, noisyInput, label, test_size=0.2, random_state=123)\n","\n","\n","        augmented_test_result[SEED][idx].append(LogisticAugmentedCorrectionFunction(noisyInput, inputTest, label, labelTest, augmentedInput, augmentedLabel))\n","\n","    idx = idx +1\n"],"metadata":{"id":"i6gfLvJHyPnI","executionInfo":{"status":"ok","timestamp":1727815112992,"user_tz":240,"elapsed":136709,"user":{"displayName":"胡平波","userId":"14290925349410595096"}}},"execution_count":4,"outputs":[]},{"cell_type":"code","source":["\n","c2 = np.array(augmented_test_result)\n","\n","print(c2.mean(axis=0))\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hrxCAg-FyQOa","executionInfo":{"status":"ok","timestamp":1727815112992,"user_tz":240,"elapsed":8,"user":{"displayName":"胡平波","userId":"14290925349410595096"}},"outputId":"ab8ff268-d4d6-4574-fc06-426a8fd151a6"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["[[[0.94025    0.99285922 0.89175361 0.93931317]\n","  [0.9256     0.99433875 0.84572203 0.91366087]\n","  [0.9165     0.99734088 0.79881045 0.88642304]\n","  [0.9315     0.98332385 0.88591113 0.93180079]\n","  [0.9323     0.9856286  0.88474561 0.93215443]\n","  [0.93645    0.98965636 0.88820948 0.93584853]]\n","\n"," [[0.9544     0.9837228  0.92786425 0.95479205]\n","  [0.95       0.98653901 0.90542936 0.94398128]\n","  [0.9449     0.98966963 0.87493616 0.92831351]\n","  [0.9431     0.97068456 0.92036174 0.94462019]\n","  [0.9471     0.97519406 0.92322466 0.94825216]\n","  [0.9504     0.97946172 0.92504504 0.95119549]]\n","\n"," [[0.96015    0.97548682 0.94721878 0.96096427]\n","  [0.9582     0.97659649 0.93294949 0.95404676]\n","  [0.9577     0.97818227 0.91743793 0.94653359]\n","  [0.9459     0.96103463 0.93577256 0.94804813]\n","  [0.952      0.9669587  0.94118591 0.95371599]\n","  [0.95475    0.97129301 0.94174105 0.9560316 ]]\n","\n"," [[0.96035    0.97019686 0.95316549 0.96139825]\n","  [0.9596     0.97227609 0.94045277 0.9558743 ]\n","  [0.96035    0.97226033 0.92993814 0.95038511]\n","  [0.94645    0.95688948 0.94129709 0.94885507]\n","  [0.9531     0.9637714  0.94663372 0.95493874]\n","  [0.9551     0.96732379 0.94655236 0.95656034]]\n","\n"," [[0.961      0.96855131 0.95615739 0.96212911]\n","  [0.9602     0.96968912 0.94432737 0.95661729]\n","  [0.96105    0.96815582 0.93579312 0.95142611]\n","  [0.94645    0.95421275 0.94429326 0.94905133]\n","  [0.95355    0.96203295 0.94948598 0.955516  ]\n","  [0.9552     0.96508734 0.94911873 0.95676459]]]\n"]}]},{"cell_type":"code","source":["noiseMeanSEList = [[1, 1], [1.2, 1], [1.4, 1], [1, 0.2], [1, 0.5], [1, 0.8]]\n","mismeasuredList = [[0, 0], [0.2, 0.1], [-0.2, -0.1], [0.2, -0.1]]\n","true_test_result = []\n","naive_test_result = []\n","augmented_test_result = []\n","\n","for SEED in range(100):\n","    augmented_test_result.append([])\n","    numAugmentedData = 10000\n","    numInEC = 1000\n","    ##noiseMean = 1-1.5, noiseSE=1\n","    ##noiseMean = 1, noiseSE = 0.2-1\n","    numData, paraInInputModel, paraMean, paraSE = 1000, 10, 1, 1\n","\n","\n","    for [noiseMean, noiseSE] in noiseMeanSEList:\n","      augmented_test_result[SEED].append([])\n","      idx = noiseMeanSEList.index([noiseMean, noiseSE])\n","      inputData, noisyInput, label = GenerateData(numData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","      inputTest, noisyInputTest, labelTest = GenerateData(200, SEED+1000, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","      for [a1, a2] in mismeasuredList:\n","        augmentedInput, augmentedLabel = GenerateAugmentedData(numAugmentedData, SEED, paraInInputModel, paraMean, paraSE, noiseMean+a1, noiseSE+a2)\n","\n","        #inputTrain, inputTest, noisyInputTrain, noisyInputTest, labelTrain, labelTest = train_test_split(\n","        #    inputData, noisyInput, label, test_size=0.2, random_state=123)\n","        augmented_test_result[SEED][idx].append(LogisticAugmentedCorrectionFunction(noisyInput, inputTest, label, labelTest, augmentedInput, augmentedLabel))\n",""],"metadata":{"id":"293OD1SDyVmn","executionInfo":{"status":"ok","timestamp":1727815185242,"user_tz":240,"elapsed":72253,"user":{"displayName":"胡平波","userId":"14290925349410595096"}}},"execution_count":6,"outputs":[]},{"cell_type":"code","source":["a4 = np.array(augmented_test_result)\n","\n","print(a4.mean(axis=0))\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"w9BoaDxKyYPN","executionInfo":{"status":"ok","timestamp":1727815185243,"user_tz":240,"elapsed":9,"user":{"displayName":"胡平波","userId":"14290925349410595096"}},"outputId":"6f3cf2e1-cce0-4e35-ef9c-378e12182b74"},"execution_count":7,"outputs":[{"output_type":"stream","name":"stdout","text":["[[[0.9544     0.9837228  0.92786425 0.95479205]\n","  [0.95205    0.98549601 0.92152884 0.95225579]\n","  [0.95615    0.98210431 0.93289058 0.95669537]\n","  [0.95355    0.98439486 0.92552765 0.95385815]]\n","\n"," [[0.95       0.98653901 0.90542936 0.94398128]\n","  [0.946      0.98821595 0.89522621 0.93916458]\n","  [0.9521     0.98423589 0.91215858 0.94658377]\n","  [0.9472     0.9874429  0.89848069 0.94060949]]\n","\n"," [[0.9449     0.98966963 0.87493616 0.92831351]\n","  [0.9406     0.99168988 0.8625153  0.92207501]\n","  [0.9486     0.98625165 0.88709436 0.93357966]\n","  [0.94145    0.99168972 0.86458914 0.92330595]]\n","\n"," [[0.9431     0.97068456 0.92036174 0.94462019]\n","  [0.94185    0.97336872 0.91525554 0.94320304]\n","  [0.9444     0.96896081 0.92457281 0.94602125]\n","  [0.94245    0.97269206 0.91703631 0.94382266]]\n","\n"," [[0.9471     0.97519406 0.92322466 0.94825216]\n","  [0.9452     0.97711482 0.91764917 0.94617528]\n","  [0.9482     0.97353017 0.9269855  0.94942955]\n","  [0.9461     0.97618903 0.92029331 0.9471557 ]]\n","\n"," [[0.9504     0.97946172 0.92504504 0.95119549]\n","  [0.9486     0.98078654 0.92015936 0.94921783]\n","  [0.95185    0.97772226 0.92960253 0.9527816 ]\n","  [0.94965    0.98013716 0.92285058 0.95034351]]]\n"]}]}]}