{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "### load images from path, and get their embeddings from a pre-trained model (ViT), and save them to a file, for later use\n",
    "### this script is for the case when the images are not in the dataset, but in a folder\n",
    "### the embeddings are saved as a numpy array, with the same order as the images\n",
    "### the images are loaded, resized to 224x224, and normalized\n",
    "### the embeddings are extracted from the pre-trained model, and saved to a file\n",
    "\n",
    "import os\n",
    "import numpy as np\n",
    "import torch\n",
    "from PIL import Image\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "### get the model from the huggingface library\n",
    "from transformers import ViTFeatureExtractor, ViTModel, ViTImageProcessor\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from torchvision.transforms import Compose, Resize, ToTensor, Normalize\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "import matplotlib.pyplot as plt\n",
    "### import svm\n",
    "from sklearn.svm import SVC as SVM\n",
    "### make_pipeline\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.metrics import accuracy_score\n",
    "from small_text.classifiers import ConfidenceEnhancedLinearSVC\n",
    "from small_text.classifiers.factories import SklearnClassifierFactory\n",
    "from small_text import (\n",
    "    PoolBasedActiveLearner,\n",
    "    AnchorSubsampling,\n",
    "    PredictionEntropy,\n",
    "    random_initialization_balanced,\n",
    "    DiscriminativeActiveLearning,\n",
    "    SEALS\n",
    ")\n",
    "from small_text.data import SklearnDataset\n",
    "\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from small_text.classifiers import SklearnClassifier\n",
    "### import Linear Regression from sklearn and Lasso\n",
    "from sklearn import linear_model\n",
    "from sklearn.svm import SVR,SVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate(active_learner, hard_dataset, easy_dataset):\n",
    "    y_pred_hard = active_learner.classifier.predict(hard_dataset)\n",
    "    y_pred_easy = active_learner.classifier.predict(easy_dataset)\n",
    "    \n",
    "    test_acc_hard = accuracy_score(y_pred_hard, hard_dataset.y)\n",
    "    test_acc_easy = accuracy_score(y_pred_easy, easy_dataset.y)\n",
    "\n",
    "    # print(f\"Test Accuracy Hard: {test_acc_hard}\")\n",
    "    # print(f\"Test Accuracy Easy: {test_acc_easy}\")\n",
    "    \n",
    "    return test_acc_easy,test_acc_hard\n",
    "### load the model\n",
    "model_name = \"google/vit-base-patch16-224\"  # Replace with your desired model\n",
    "model = ViTModel.from_pretrained(model_name)\n",
    "feature_extractor = ViTImageProcessor.from_pretrained(model_name)\n",
    "def preprocess(image):\n",
    "    inputs = feature_extractor(images=image, return_tensors=\"pt\")\n",
    "    return inputs\n",
    "def load_image(image_path):\n",
    "    image = Image.open(image_path).convert('RGB')\n",
    "    return image\n",
    "# Function to obtain embeddings from the ViT model\n",
    "def get_vit_embeddings(image_path):\n",
    "    image = load_image(image_path)\n",
    "    image = preprocess(image)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        outputs = model(**image)\n",
    "\n",
    "    # Obtain the embeddings from the last hidden state\n",
    "    embeddings = outputs.last_hidden_state\n",
    "    return embeddings\n",
    "\n",
    "\n",
    "def return_difficult_samples(df_new_train_easy_random,df_new_embedding,df_new,df_new_test_hard,df_new_test_easy,n_hard):\n",
    "    #### Exploit Step\n",
    "    lasso_model = linear_model.Lasso( max_iter=100000,alpha=0.05)\n",
    "    \n",
    "    scaler = StandardScaler()\n",
    "\n",
    "    X_train = df_new_embedding[df_new_train_easy_random]\n",
    "    y_train = df_new.difficulty_score[df_new_train_easy_random]\n",
    "    X_train = scaler.fit_transform(X_train)\n",
    "    clf = make_pipeline(StandardScaler(), lasso_model)\n",
    "    clf.fit(X_train, y_train)\n",
    "    top_features = np.argwhere(clf.named_steps['lasso'].coef_!=0).flatten()\n",
    "    ### train a linear regression model on the embeddings on the top features\n",
    "    X = df_new_embedding[df_new_train_easy_random][:,top_features]\n",
    "    X = scaler.fit_transform(X)\n",
    "    y = df_new.difficulty_score[df_new_train_easy_random]\n",
    "    ### increase convergence tolerance\n",
    "    lm =linear_model.LinearRegression()\n",
    "\n",
    "    lm.fit(X, y)\n",
    "    \n",
    "    X_test = df_new_embedding[:,top_features]\n",
    "    X_test = scaler.transform(X_test)\n",
    "    y_test = df_new.difficulty_score\n",
    "    y_pred = lm.predict(X_test)\n",
    "    sorted_indices = np.argsort(y_pred)\n",
    "    sorted_indices = [i for i in sorted_indices if i not in df_new_train_easy_random]\n",
    "    sorted_indices = [i for i in sorted_indices if i not in df_new_test_hard]\n",
    "    sorted_indices = [i for i in sorted_indices if i not in df_new_test_easy]\n",
    "    return sorted_indices[-n_hard:]\n",
    "def return_difficult_samples_classifier(df_new_train_easy_random,df_new_embedding,df_new,df_new_test_hard,df_new_test_easy,n_hard):\n",
    "    # lasso_model = linear_model.Lasso( max_iter=100000,alpha=0.1)\n",
    "    \n",
    "    # scaler = StandardScaler()\n",
    "    scaler = StandardScaler()\n",
    "    X_train = df_new_embedding[df_new_train_easy_random]\n",
    "    y_train = df_new.difficulty_score[df_new_train_easy_random]>3.8\n",
    "    X_train = scaler.fit_transform(X_train)\n",
    "    clf =  ConfidenceEnhancedLinearSVC() #SVC(kernel=\"rbf\",probability=True,class_weight=\"balanced\")\n",
    "    clf.fit(X_train, y_train)\n",
    "    \n",
    "    \n",
    "    X_test = df_new_embedding[:,:]\n",
    "    X_test = scaler.transform(X_test)\n",
    "    y_test = df_new.difficulty_score>3.8\n",
    "    y_pred = clf.predict_proba(X_test)[:,1]\n",
    "    sorted_indices = np.argsort(y_pred)\n",
    "    \n",
    "    sorted_indices = [i for i in sorted_indices if i not in df_new_train_easy_random]\n",
    "    sorted_indices = [i for i in sorted_indices if i not in df_new_test_hard]\n",
    "    sorted_indices = [i for i in sorted_indices if i not in df_new_test_easy]\n",
    "    print(y_test[sorted_indices[-n_hard:]].sum())\n",
    "    return sorted_indices[-n_hard:]\n",
    "def train_clf(df_subset,df_embeddings,df_labels):\n",
    "    X = df_embeddings[df_subset]\n",
    "    y = df_labels[df_subset]\n",
    "    scaler = StandardScaler()\n",
    "    X_train = scaler.fit_transform(X)\n",
    "    clf = make_pipeline(StandardScaler(), SVC(class_weight=\"balanced\",max_iter=10000) )\n",
    "    clf.fit(X_train, y)\n",
    "    return clf,scaler\n",
    "def return_metric(clf_model,df_test_easy,df_test_hard,df_embedding,df_labels,scaler):\n",
    "    \n",
    "    X_test = df_embedding[df_test_easy]\n",
    "    X_test = scaler.transform(X_test)\n",
    "    y_test = df_labels[df_test_easy]\n",
    "    y_pred = clf_model.predict(X_test)\n",
    "    easy_accuracy = (y_test==y_pred).mean()\n",
    "    X_test = df_embedding[df_test_hard]\n",
    "    X_test = scaler.transform(X_test)\n",
    "    y_test = df_labels[df_test_hard]\n",
    "    y_pred = clf_model.predict(X_test)\n",
    "    hard_accuracy = (y_test==y_pred).mean()\n",
    "    return easy_accuracy,hard_accuracy\n",
    " ### get good vectors by maximizing the minimum eigenvalue of the covariance matrix\n",
    "    import cvxpy as cp\n",
    "    def get_good_vectors(embeddings):\n",
    "        n = embeddings.shape[0]\n",
    "        d = embeddings.shape[1]\n",
    "        x = cp.Variable(n)\n",
    "        constraints = [cp.sum(x)==1,x>=0]\n",
    "        objective_function = embeddings.T @ cp.diag(x) @ embeddings\n",
    "        objective = cp.Maximize(cp.lambda_min(objective_function))\n",
    "        prob = cp.Problem(objective,constraints)\n",
    "        prob.solve()\n",
    "        return x.value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### load the images\n",
    "path = \"data/image_classification/VOC2012\"  #master path\n",
    "objects = os.listdir(path+\"/ImageSets/Main\")  #get the objects\n",
    "path_images = path+\"/JPEGImages\"  #path to the images\n",
    "lower_bound = 3.1\n",
    "upper_bound = 3.9\n",
    "object_names = [\"chair_trainval\",\"car_trainval\",\"bottle_trainval\",]\n",
    "\n",
    "n_easy = 120\n",
    "n_difficult = 100\n",
    "num_easy_train = 30\n",
    "num_difficult_train = 90\n",
    "N_random = 5\n",
    "N_queries = 1\n",
    "results = np.zeros((len(object_names),N_random,2,N_queries+1, 2))\n",
    "metrics = np.zeros((len(object_names),N_random,3,2))\n",
    "active_learning_init = num_easy_train\n",
    "num_sample_per_query = num_difficult_train\n",
    "dfs = {}\n",
    "for i in range(len(objects)):\n",
    "    dfs[objects[i].split(\".\")[0]] = pd.read_csv(path+\"/ImageSets/Main/\"+objects[i],header=None,sep='\\s+',names=[\"image\",\"label\"])\n",
    "def return_image_paths(object_name):\n",
    "    return [path_images+\"/\"+image+\".jpg\" for image in dfs[object_name][\"image\"].values]\n",
    "for k,object_name in enumerate(object_names):\n",
    "    object_name_just = object_name.split(\"_\")[0]\n",
    "    paths = return_image_paths(object_name)\n",
    "    task_input = pd.read_csv(\"./data/image_classification/task-input.csv\")\n",
    "    task_input['image'] = task_input['imageurl'].apply(lambda x:x.split(\".\")[0])\n",
    "    vsd_data = pd.read_csv(\"./data/image_classification/VSD_dataset.csv\",names=[\"image\",\"difficulty_score\"])\n",
    "    person_paths = task_input[task_input['class'] == object_name_just]['imageurl'].values\n",
    "    #### Uncomment incase embeddings are not saved\n",
    "    # embeddings = np.zeros((len(person_paths),768))\n",
    "    # for i in tqdm(range(len(person_paths))):\n",
    "    #     embeddings[i] = get_vit_embeddings(\"data/image_classification/VOC2012/JPEGImages/\"+person_paths[i])[:,0,:].numpy()\n",
    "    # np.save(\"data/image_classification/embeddings/\"+object_name+\".npy\",embeddings)\n",
    "    embeddings = np.load(\"data/image_classification/embeddings/\"+object_name+\".npy\")\n",
    "\n",
    "\n",
    "    df = pd.DataFrame(embeddings)\n",
    "    df[\"image\"] = person_paths\n",
    "    df_person = dfs[object_name]\n",
    "    df['image'] = df.image.apply(lambda x: x.split(\".\")[0])\n",
    "    df = pd.merge(df,df_person,on=\"image\")\n",
    "    difficulty_score = pd.read_csv(\"./data/image_classification/VSD_dataset.csv\",names=[\"image\",\"difficulty_score\"])\n",
    "    df['image'] = df['image'].apply(lambda x: x.split(\".\")[0])\n",
    "    df = df.merge(difficulty_score,on=\"image\")\n",
    "    df_embedding = df.iloc[:,:768].values\n",
    "    df['label'] = df['label'].apply(lambda x: 1 if x == 1 else 0)\n",
    "    df_new = df.copy()\n",
    "\n",
    "   \n",
    "    ### pca from scikit learn\n",
    "    pca = PCA(n_components=10)\n",
    "    pca.fit(df_new.iloc[:,:768].values)\n",
    "    df_new_pca = pca.transform(df_new.iloc[:,:768].values)\n",
    "    good_vectors = get_good_vectors(df_new_pca)\n",
    "    top_d_good_vectors = np.argsort(good_vectors)[-500:]\n",
    "\n",
    "    clf_template = ConfidenceEnhancedLinearSVC()\n",
    "    num_classes = 2\n",
    "    clf_factory = SklearnClassifierFactory(clf_template, num_classes)\n",
    "    np.random.seed(0)\n",
    "\n",
    "    for i in tqdm(range(N_random)):\n",
    "        df_new_test_easy = np.random.choice(df_new[df_new.difficulty_score<=lower_bound].index,n_easy,replace=False)\n",
    "        df_new_test_hard = np.random.choice(df_new[df_new.difficulty_score>=upper_bound].index,n_difficult,replace=False)\n",
    "        df_new_train_easy = df_new[(df_new.difficulty_score<=lower_bound)& ~(df_new.index.isin(df_new_test_easy))].index\n",
    "        df_new_train_hard = df_new[(df_new.difficulty_score>=upper_bound)& ~(df_new.index.isin(df_new_test_hard))].index\n",
    "        top_d_good_vectors_allowed = [i for i in top_d_good_vectors if (i not in df_new_test_easy) and (i not in df_new_test_hard)]\n",
    "        df_new_embedding = df_new.iloc[:,:768].values\n",
    "        df_new_label = df_new.label.values\n",
    "        df_all = np.concatenate([df_new_train_easy,df_new_train_hard])\n",
    "        # ### train an svm on easy images\n",
    "        df_new_train_easy_random = np.random.choice(df_all,num_easy_train+num_difficult_train,replace=False)\n",
    "        clf,scaler = train_clf(df_new_train_easy_random,df_new_embedding,df_new_label)\n",
    "        metrics[k,i,0] = return_metric(clf,df_new_test_easy,df_new_test_hard,df_new_embedding,df_new_label,scaler)\n",
    "       \n",
    "        #### train on all images      \n",
    "        clf_all,scaler = train_clf(df_all,df_new_embedding,df_new_label)\n",
    "        metrics[k,i,1] = return_metric(clf_all,df_new_test_easy,df_new_test_hard,df_new_embedding,df_new_label,scaler)\n",
    "\n",
    "\n",
    "        ## train on random and predicted difficult images\n",
    "        df_new_train_easy_random = np.random.choice(top_d_good_vectors_allowed,num_easy_train,replace=False)\n",
    "        #clf_random_difficult,scaler = train_clf(df_new_train_easy_random,df_new_embedding,df_new_label)\n",
    "        df_difficult = return_difficult_samples(df_new_train_easy_random,df_new_embedding,df_new,df_new_test_hard,df_new_test_easy,n_hard = num_difficult_train)\n",
    "        df_mixed = np.concatenate([df_new_train_easy_random,df_difficult])\n",
    "        clf_difficult,scaler = train_clf(df_mixed,df_new_embedding,df_new_label)\n",
    "        metrics[k,i,2,:] = return_metric(clf_difficult,df_new_test_easy,df_new_test_hard,df_new_embedding,df_new_label,scaler)\n",
    "        \n",
    "        ## active learning\n",
    "        df_new_label = df_new.label.values==1\n",
    "\n",
    "        x = df_new_embedding[np.concatenate([df_new_train_easy,df_new_train_hard])]\n",
    "        y = df_new_label[np.concatenate([df_new_train_easy,df_new_train_hard])]\n",
    "        x_test_hard = df_new_embedding[df_new_test_hard]\n",
    "        y_test_hard = df_new_label[df_new_test_hard]\n",
    "        x_test_easy = df_new_embedding[df_new_test_easy]\n",
    "        y_test_easy = df_new_label[df_new_test_easy]\n",
    "\n",
    "        dataset = SklearnDataset(x, y, target_labels=np.arange(2))\n",
    "        hard_dataset = SklearnDataset(x_test_hard, y_test_hard, target_labels=np.arange(2))\n",
    "        easy_dataset = SklearnDataset(x_test_easy, y_test_easy, target_labels=np.arange(2))\n",
    "        learner = PoolBasedActiveLearner(\n",
    "            clf_factory,\n",
    "            AnchorSubsampling(PredictionEntropy(),num_anchors=2),\n",
    "            dataset,\n",
    "        )\n",
    "        random_indices = np.random.choice(np.arange(x.shape[0]), size=active_learning_init, replace=False)\n",
    "        indices_labeled = random_indices\n",
    "        learner.initialize_data(random_indices, y[random_indices])\n",
    "\n",
    "        results[k,i,0, 0, :] = evaluate(learner, hard_dataset, easy_dataset)\n",
    "\n",
    "            \n",
    "        for j in range(N_queries):\n",
    "            # ...where each iteration consists of labelling 20 samples\n",
    "            indices_queried = learner.query(num_samples=num_sample_per_query)\n",
    "\n",
    "            # Simulate user interaction here. Replace this for real-world usage.\n",
    "            y_true = dataset.y[indices_queried]\n",
    "\n",
    "            # Return the labels for the current query to the active learner.\n",
    "            learner.update(y_true)\n",
    "\n",
    "            indices_labeled = np.concatenate([indices_queried, indices_labeled])\n",
    "            \n",
    "            #print(f'Iteration #{i} ({len(indices_labeled)} samples)')\n",
    "            results[k,i,0, j+1, :] = evaluate(learner, hard_dataset, easy_dataset)\n",
    "            \n",
    "        learner = PoolBasedActiveLearner(\n",
    "            clf_factory,\n",
    "            SEALS(PredictionEntropy(),),\n",
    "            dataset,\n",
    "        )\n",
    "        random_indices = np.random.choice(np.arange(x.shape[0]), size=active_learning_init, replace=False)\n",
    "        indices_labeled = random_indices\n",
    "        learner.initialize_data(random_indices, y[random_indices])\n",
    "\n",
    "        results[k,i,1, 0, :] = evaluate(learner, hard_dataset, easy_dataset)\n",
    "            \n",
    "        for j in range(N_queries):\n",
    "            # ...where each iteration consists of labelling 20 samples\n",
    "            indices_queried = learner.query(num_samples=num_sample_per_query)\n",
    "\n",
    "            # Simulate user interaction here. Replace this for real-world usage.\n",
    "            y_true = dataset.y[indices_queried]\n",
    "\n",
    "            # Return the labels for the current query to the active learner.\n",
    "            learner.update(y_true)\n",
    "\n",
    "            indices_labeled = np.concatenate([indices_queried, indices_labeled])\n",
    "            results[k,i,1, j+1, :] = evaluate(learner, hard_dataset, easy_dataset)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_results = np.round(100*np.concatenate([results.mean(axis=1)[:,:,-1,:].T,metrics.mean(axis=1).T],axis=1),1)\n",
    "std_results = np.round(100*np.concatenate([results.std(axis=1)[:,:,-1,:].T,metrics.std(axis=1).T],axis=1),1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(object_names)):\n",
    "    mean_results_object = mean_results[:,:,i].flatten()\n",
    "    std_results_object = std_results[:,:,i].flatten()\n",
    "    string_results = [str(mean_results_object[i])+r\"$\\pm$\"+str(std_results_object[i]) for i in range(len(mean_results_object))]\n",
    "    print(\" & \".join(string_results[:5]))\n",
    "    print(\" & \".join(string_results[5:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\" & \".join(string_results[15:20]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
