{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOivpPLjUQ6U9V0BB2FESCX"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"pMU5Fh9KsVYU","executionInfo":{"status":"ok","timestamp":1727814363631,"user_tz":240,"elapsed":5240,"user":{"displayName":"胡平波","userId":"14290925349410595096"}}},"outputs":[],"source":["import numpy as np\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n","from sklearn.linear_model import LogisticRegression\n","\n","def logistic(z):\n","    return 1 / (1 + np.exp(-z))\n","\n","def GenerateData(n, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE):\n","    np.random.seed(SEED)\n","    inputData = np.random.normal(loc=paraMean, scale=paraSE, size=n)\n","    noise = np.random.normal(loc=noiseMean, scale=noiseSE, size=n)\n","    noisyInput = inputData + noise\n","\n","    trueLabelProbability = logistic(paraInInputModel * inputData+1)\n","    trueLabel = np.random.binomial(1, trueLabelProbability.flatten())\n","\n","    return inputData, noisyInput, trueLabel\n","\n","\n","def GenerateAugmentedData(n, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE):\n","    np.random.seed(2*SEED)\n","    inputData = np.random.normal(loc=paraMean, scale=paraSE, size=n)\n","    noise = np.random.normal(loc=noiseMean, scale=noiseSE, size=n)\n","    noisyInput = inputData + noise\n","\n","    trueLabelProbability = logistic(paraInInputModel * inputData+1)\n","    trueLabel = np.random.binomial(1, trueLabelProbability.flatten())\n","\n","    return inputData, trueLabel\n","\n","def LogisticTrueFunction(inputTrain, inputTest, labelTrain, labelTest):\n","    clf = LogisticRegression(solver='lbfgs', max_iter=400)\n","    clf.fit(inputTrain.reshape(-1, 1), labelTrain)  # Reshape to 2D\n","    y_pred = clf.predict(inputTest.reshape(-1, 1))  # Reshape to 2D\n","    true_test_result = [accuracy_score(labelTest, y_pred), precision_score(labelTest, y_pred),\n","                        recall_score(labelTest, y_pred), f1_score(labelTest, y_pred)]\n","    return true_test_result\n","\n","def LogisticNaiveFunction(noisyInputTrain, inputTest, labelTrain, labelTest):\n","    clf = LogisticRegression(solver='lbfgs', max_iter=400)\n","    clf.fit(noisyInputTrain.reshape(-1, 1), labelTrain)  # Reshape to 2D\n","    y_pred = clf.predict(inputTest.reshape(-1, 1))  # Reshape to 2D\n","    naive_test_result = [accuracy_score(labelTest, y_pred), precision_score(labelTest, y_pred),\n","                         recall_score(labelTest, y_pred), f1_score(labelTest, y_pred)]\n","    return naive_test_result\n","\n","def LogisticAugmentedCorrectionFunction(noisyInputTrain, inputTest, labelTrain, labelTest, augmentedInput, augmentedLabel):\n","    inputData = np.concatenate((noisyInputTrain, augmentedInput))\n","    label = np.concatenate((labelTrain, augmentedLabel))\n","    clf = LogisticRegression(solver='lbfgs', max_iter=400)\n","\n","    clf.fit(inputData.reshape(-1, 1), label)  # Reshape to 2D\n","    y_pred = clf.predict(inputTest.reshape(-1, 1))  # Reshape to 2D\n","    correction_test_result = [accuracy_score(labelTest, y_pred), precision_score(labelTest, y_pred),\n","                        recall_score(labelTest, y_pred), f1_score(labelTest, y_pred)]\n","    return correction_test_result\n"]},{"cell_type":"code","source":["true_test_result = []\n","naive_test_result = []\n","augmented_test_result = []\n","\n","Cases = [[-1, 0.2], [-1.2, 0.2], [-1.4, 0.2], [-1, 0.4], [-1, 0.6], [-1, 0.8]]\n","\n","for SEED in range(100):\n","    numAugmentedData = 10000\n","    numInEC = 1000\n","    ##noiseMean = 1-1.5, noiseSE=1\n","    ##noiseMean = 1, noiseSE = 0.2-1\n","    numData, paraInInputModel, paraMean, paraSE = 1000, 10, 1, 1\n","    augmented_test_result.append([])\n","    naive_test_result.append([])\n","    true_test_result.append([])\n","    for [noiseMean, noiseSE] in Cases:\n","        inputData, noisyInput, label = GenerateData(numData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","        augmentedInput, augmentedLabel = GenerateAugmentedData(numAugmentedData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","\n","        #inputTrain, inputTest, noisyInputTrain, noisyInputTest, labelTrain, labelTest = train_test_split(\n","        #    inputData, noisyInput, label, test_size=0.2, random_state=123)\n","        ###create the test data\n","        inputTest, noisyInputTest, labelTest = GenerateData(200, SEED+1000, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","\n","        true_test_result[SEED].append(LogisticTrueFunction(inputData, inputTest, label, labelTest))\n","        naive_test_result[SEED].append(LogisticNaiveFunction(noisyInput, inputTest, label, labelTest))\n","        augmented_test_result[SEED].append(LogisticAugmentedCorrectionFunction(noisyInput, inputTest, label, labelTest, augmentedInput, augmentedLabel))\n","\n","\n","\n"],"metadata":{"id":"HKerzL5Hvwkj","executionInfo":{"status":"ok","timestamp":1727814428913,"user_tz":240,"elapsed":65285,"user":{"displayName":"胡平波","userId":"14290925349410595096"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["a1 = np.array(true_test_result)\n","b1 = np.array(naive_test_result)\n","c1 = np.array(augmented_test_result)\n","\n","print(a1.mean(axis=0))\n","print(b1.mean(axis=0))\n","print(c1.mean(axis=0))\n","\n","\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MEfFEdnMv0ii","executionInfo":{"status":"ok","timestamp":1727814428914,"user_tz":240,"elapsed":27,"user":{"displayName":"胡平波","userId":"14290925349410595096"}},"outputId":"003165de-6d30-484c-8155-121063dbbd70"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["[[0.9664     0.97436967 0.98682425 0.98050297]\n"," [0.9664     0.97436967 0.98682425 0.98050297]\n"," [0.9664     0.97436967 0.98682425 0.98050297]\n"," [0.9664     0.97436967 0.98682425 0.98050297]\n"," [0.9664     0.97436967 0.98682425 0.98050297]\n"," [0.9664     0.97436967 0.98682425 0.98050297]]\n","[[0.87275    0.87086523 1.         0.93083895]\n"," [0.8672     0.86599464 1.         0.92803757]\n"," [0.86345    0.86272989 1.         0.92615441]\n"," [0.86865    0.86726588 1.         0.92876891]\n"," [0.86455    0.86368854 1.         0.9267059 ]\n"," [0.86085    0.86047655 1.         0.92485451]]\n","[[0.95095    0.94813833 0.99738721 0.97208362]\n"," [0.9418     0.9375075  0.99878114 0.96709758]\n"," [0.9308     0.92571708 0.99958532 0.96114269]\n"," [0.9493     0.94587167 0.99803395 0.9711956 ]\n"," [0.94655    0.9427759  0.99831748 0.96968871]\n"," [0.9422     0.93796762 0.99872174 0.96731553]]\n"]}]},{"cell_type":"code","source":["true_test_result = []\n","naive_test_result = []\n","augmented_test_result = []\n","numAugmentedDataList = [5000, 10000, 20000, 30000, 40000]\n","\n","for SEED in range(100):\n","  augmented_test_result.append([])\n","  idx = 0\n","  for numAugmentedData in numAugmentedDataList:\n","    augmented_test_result[SEED].append([])\n","    numInEC = 1000\n","    ##noiseMean = 1-1.5, noiseSE=1\n","    ##noiseMean = 1, noiseSE = 0.2-1\n","    numData, paraInInputModel, paraMean, paraSE = 1000, 10, 1, 1\n","    for [noiseMean, noiseSE] in Cases:\n","        inputData, noisyInput, label = GenerateData(numData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","        augmentedInput, augmentedLabel = GenerateAugmentedData(numAugmentedData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","\n","        #inputTrain, inputTest, noisyInputTrain, noisyInputTest, labelTrain, labelTest = train_test_split(\n","        #    inputData, noisyInput, label, test_size=0.2, random_state=123)\n","\n","        ###create the test data\n","        inputTest, noisyInputTest, labelTest = GenerateData(200, SEED+1000, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","\n","\n","        augmented_test_result[SEED][idx].append(LogisticAugmentedCorrectionFunction(noisyInput, inputTest, label, labelTest, augmentedInput, augmentedLabel))\n","\n","    idx = idx +1\n"],"metadata":{"id":"riqGzSHtv7ix","executionInfo":{"status":"ok","timestamp":1727814559327,"user_tz":240,"elapsed":130417,"user":{"displayName":"胡平波","userId":"14290925349410595096"}}},"execution_count":4,"outputs":[]},{"cell_type":"code","source":["\n","c2 = np.array(augmented_test_result)\n","\n","print(c2.mean(axis=0))\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"HVuvORMCv-Kq","executionInfo":{"status":"ok","timestamp":1727814559328,"user_tz":240,"elapsed":20,"user":{"displayName":"胡平波","userId":"14290925349410595096"}},"outputId":"68606a02-122f-46d4-c715-3de3f1abbed3"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["[[[0.9298     0.92471654 0.99958532 0.96060592]\n","  [0.9162     0.91107992 0.99994286 0.95335136]\n","  [0.9005     0.89609961 1.         0.94509986]\n","  [0.9267     0.92144832 0.99982237 0.95894502]\n","  [0.92005    0.91486664 0.99988225 0.95539362]\n","  [0.91295    0.90795535 0.99994286 0.95163405]]\n","\n"," [[0.95095    0.94813833 0.99738721 0.97208362]\n","  [0.9418     0.9375075  0.99878114 0.96709758]\n","  [0.9308     0.92571708 0.99958532 0.96114269]\n","  [0.9493     0.94587167 0.99803395 0.9711956 ]\n","  [0.94655    0.9427759  0.99831748 0.96968871]\n","  [0.9422     0.93796762 0.99872174 0.96731553]]\n","\n"," [[0.9615     0.96309804 0.99319513 0.97786546]\n","  [0.9573     0.95696443 0.99499451 0.97555795]\n","  [0.9529     0.9507457  0.99674582 0.97314542]\n","  [0.961      0.96219418 0.9935973  0.97759462]\n","  [0.9599     0.96068175 0.99394859 0.97698313]\n","  [0.95815    0.95807737 0.99476081 0.97602553]]\n","\n"," [[0.9649     0.96937616 0.99037564 0.9797236 ]\n","  [0.9625     0.96476658 0.99254888 0.97841378]\n","  [0.95955    0.96045734 0.99377385 0.97678098]\n","  [0.9641     0.96838798 0.99048996 0.97927083]\n","  [0.96365    0.96719086 0.99125307 0.97903025]\n","  [0.9628     0.96541489 0.99219411 0.9785759 ]]\n","\n"," [[0.9663     0.97217112 0.98903773 0.98048941]\n","  [0.9646     0.96920126 0.9902037  0.97954896]\n","  [0.963      0.96568188 0.99213579 0.97868363]\n","  [0.966      0.97162012 0.98926768 0.98032164]\n","  [0.96555    0.97073942 0.98967715 0.9800747 ]\n","  [0.96505    0.96975479 0.99014346 0.97980212]]]\n"]}]},{"cell_type":"code","source":["\n","Cases = [[-1, 0.2], [-1.2, 0.2], [-1.4, 0.2], [-1, 0.4], [-1, 0.6], [-1, 0.8]]\n","mismeasuredList = [[0, 0], [-0.2, 0.1], [0.2, -0.1], [-0.2, -0.1]]\n","true_test_result = []\n","naive_test_result = []\n","augmented_test_result = []\n","numAugmentedData = 10000\n","numInEC = 1000\n","for SEED in range(100):\n","    augmented_test_result.append([])\n","\n","    ##noiseMean = 1-1.5, noiseSE=1\n","    ##noiseMean = 1, noiseSE = 0.2-1\n","    numData, paraInInputModel, paraMean, paraSE = 1000, 10, 1, 1\n","\n","\n","    for [noiseMean, noiseSE] in Cases:\n","      augmented_test_result[SEED].append([])\n","      idx = Cases.index([noiseMean, noiseSE])\n","      inputData, noisyInput, label = GenerateData(numData, SEED, paraInInputModel, paraMean, paraSE, noiseMean, noiseSE)\n","      for [a1, a2] in mismeasuredList:\n","        augmentedInput, augmentedLabel = GenerateAugmentedData(numAugmentedData, SEED, paraInInputModel, paraMean, paraSE, noiseMean+a1, noiseSE+a2)\n","        inputTest, noisyInputTest, labelTest = GenerateData(200, SEED+1000, paraInInputModel, paraMean, paraSE, noiseMean+a1, noiseSE+a2)\n","\n","        #inputTrain, inputTest, noisyInputTrain, noisyInputTest, labelTrain, labelTest = train_test_split(\n","        #    inputData, noisyInput, label, test_size=0.2, random_state=123)\n","        augmented_test_result[SEED][idx].append(LogisticAugmentedCorrectionFunction(noisyInput, inputTest, label, labelTest, augmentedInput, augmentedLabel))\n",""],"metadata":{"id":"uTeEHX8pwDYt","executionInfo":{"status":"ok","timestamp":1727814635101,"user_tz":240,"elapsed":75789,"user":{"displayName":"胡平波","userId":"14290925349410595096"}}},"execution_count":6,"outputs":[]},{"cell_type":"code","source":["\n","a4 = np.array(augmented_test_result)\n","print(a4.mean(axis=0))\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Go4QLenXwFiE","executionInfo":{"status":"ok","timestamp":1727814635103,"user_tz":240,"elapsed":8,"user":{"displayName":"胡平波","userId":"14290925349410595096"}},"outputId":"c5be8c67-68f0-47e8-eda4-d45fe7df615c"},"execution_count":7,"outputs":[{"output_type":"stream","name":"stdout","text":["[[[0.95095    0.94813833 0.99738721 0.97208362]\n","  [0.95095    0.94813833 0.99738721 0.97208362]\n","  [0.95095    0.94813833 0.99738721 0.97208362]\n","  [0.95095    0.94813833 0.99738721 0.97208362]]\n","\n"," [[0.9418     0.9375075  0.99878114 0.96709758]\n","  [0.9418     0.9375075  0.99878114 0.96709758]\n","  [0.9418     0.9375075  0.99878114 0.96709758]\n","  [0.9418     0.9375075  0.99878114 0.96709758]]\n","\n"," [[0.9308     0.92571708 0.99958532 0.96114269]\n","  [0.9308     0.92571708 0.99958532 0.96114269]\n","  [0.9308     0.92571708 0.99958532 0.96114269]\n","  [0.9308     0.92571708 0.99958532 0.96114269]]\n","\n"," [[0.9493     0.94587167 0.99803395 0.9711956 ]\n","  [0.9493     0.94587167 0.99803395 0.9711956 ]\n","  [0.9493     0.94587167 0.99803395 0.9711956 ]\n","  [0.9493     0.94587167 0.99803395 0.9711956 ]]\n","\n"," [[0.94655    0.9427759  0.99831748 0.96968871]\n","  [0.94655    0.9427759  0.99831748 0.96968871]\n","  [0.94655    0.9427759  0.99831748 0.96968871]\n","  [0.94655    0.9427759  0.99831748 0.96968871]]\n","\n"," [[0.9422     0.93796762 0.99872174 0.96731553]\n","  [0.9422     0.93796762 0.99872174 0.96731553]\n","  [0.9422     0.93796762 0.99872174 0.96731553]\n","  [0.9422     0.93796762 0.99872174 0.96731553]]]\n"]}]}]}