{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d43bd564-ff37-4e89-a2c5-88efc27d36fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "from kLLMmeans import kLLMmeans, get_embeddings, summarize_cluster, sequentialMiniBatchKmeans, miniBatchKLLMeans\n",
    "from experiment_utils import load_dataset, cluster_metrics, avg_closest_distance\n",
    "from sklearn.cluster import KMeans, MiniBatchKMeans\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "import numpy as np\n",
    "import json, pickle\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ebb6dd27-eacd-4727-b361-8df2121fc976",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No previous results\n",
      "10\n",
      "[2020, 0, 50, 3]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  File \"C:\\Users\\jairo\\anaconda3\\Lib\\site-packages\\joblib\\externals\\loky\\backend\\context.py\", line 257, in _count_physical_cores\n",
      "    cpu_info = subprocess.run(\n",
      "               ^^^^^^^^^^^^^^^\n",
      "  File \"C:\\Users\\jairo\\anaconda3\\Lib\\subprocess.py\", line 548, in run\n",
      "    with Popen(*popenargs, **kwargs) as process:\n",
      "         ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"C:\\Users\\jairo\\anaconda3\\Lib\\subprocess.py\", line 1026, in __init__\n",
      "    self._execute_child(args, executable, preexec_fn, close_fds,\n",
      "  File \"C:\\Users\\jairo\\anaconda3\\Lib\\subprocess.py\", line 1538, in _execute_child\n",
      "    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,\n",
      "                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "Iterating KMeans: 100%|██████████████████████████████████████████████████████████████████| 3/3 [04:51<00:00, 97.23s/it]\n",
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [05:47<00:00, 115.98s/it]\n",
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [05:39<00:00, 113.22s/it]\n",
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [05:42<00:00, 114.17s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['How do different components and techniques in neural networks, such as activation functions, loss functions, weight initialization, and regularization, contribute to the training process and affect the performance and convergence of the model?', 'How can neural networks, particularly transformers and other attention-based models, be effectively utilized and adapted for various natural language processing tasks, such as text generation, translation, and simplification, while addressing challenges like handling variable input/output lengths, incorporating prior knowledge through embeddings, and ensuring efficient training and inference?', 'How can one mathematically define afterstate value functions, including their Bellman equation, and understand their relationship to state and state-action value functions, while also addressing challenges such as local optima in reinforcement learning, exploration strategies, and the use of different divergence measures for policy comparison?', 'How can I better understand and visualize complex neural network concepts, such as equations in research papers, the impact of network architecture on performance, and the role of different components like attention mechanisms and convolutional layers, while also considering practical applications like transfer learning and continuous learning?', 'How can various search algorithms, such as Monte Carlo Tree Search (MCTS), Alpha-Beta Pruning, and Genetic Algorithms, be effectively utilized and optimized for different AI applications, including game playing, automated theorem proving, and optimization problems, while addressing challenges like exploration-exploitation trade-offs, convergence guarantees, and computational feasibility?', 'How do different machine learning concepts, such as model weights, learning paradigms (offline vs. 
online), and optimization algorithms, relate to solving complex problems in various domains, and what are some practical examples or applications of these concepts in fields like reinforcement learning, genetic algorithms, and pattern recognition?', 'How can I effectively address common challenges in training machine learning models, such as ensuring reproducibility with fixed seeds, managing overfitting, handling imbalanced datasets, and optimizing model performance with techniques like data augmentation, transfer learning, and ensemble methods?', 'How can artificial intelligence be effectively utilized across various domains, such as game development, economic valuation, numerical algorithms, and ecological research, while considering the ethical and technical constraints of AI knowledge, computational costs, and the potential for AI to predict human behavior or generate creative content?', 'How can I use computer vision techniques to detect and estimate the location of a soccer ball on a field using live camera feeds, considering different camera setups and angles, and what are the best practices for training object detection models with limited or incomplete data annotations?', 'How can reinforcement learning (RL) algorithms, such as DQN and Q-learning, be adapted or configured to handle various challenges like large state spaces, continuous action spaces, delayed rewards, and complex environments, while ensuring efficient learning and generalization?']\n",
      "No previous results\n",
      "10\n",
      "[2021, 0, 50, 3]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [05:48<00:00, 116.09s/it]\n",
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [06:14<00:00, 124.69s/it]\n",
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [06:06<00:00, 122.19s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['How can I optimize the training process of a neural network, considering factors like gradient clipping, batch size, and hyperparameter tuning, while addressing issues such as overfitting, model complexity, and data augmentation to improve model performance and generalization across different datasets and conditions?', 'How can I effectively utilize AI techniques, such as convolutional neural networks (CNNs) and object detection models, for tasks involving image data, including detecting specific objects or features, handling imbalanced datasets, and improving model performance through techniques like transfer learning, data augmentation, and specialized loss functions?', 'What are the best practices and approaches for handling various machine learning challenges, such as dealing with small datasets, feature importance, class imbalance, model interpretability, and dataset splitting, while considering different types of data and tasks like time series prediction, clustering, and classification?', 'How can different AI approaches, such as symbolic AI, neural networks, and hybrid systems, be effectively utilized or combined to achieve artificial general intelligence (AGI), considering factors like computational efficiency, resource constraints, and the integration of various AI techniques, including knowledge engineering, neuro-symbolic methods, and reinforcement learning, while addressing challenges related to safety, scalability, and adaptability in diverse applications?', 'How can reinforcement learning algorithms effectively approximate value functions and handle large state spaces, particularly when using techniques like TD-learning, Q-learning, and policy iteration, while considering factors such as learning rates, state redundancy, and the choice between value and policy functions?', 'How can I effectively understand and interpret complex mathematical concepts and methodologies in AI research papers, such as the differences between frequentist and 
Bayesian probability, stochastic approximation, adversarial attacks in neural networks, and other advanced topics, given my current foundational knowledge in mathematics and probability theory?', 'How can I effectively utilize pre-trained language models and NLP techniques to handle tasks such as text translation, entity recognition, and text classification, while addressing challenges like sequence length limitations, domain-specific vocabulary, and the need for accurate alignment between text and audio in multilingual contexts?', 'How can I effectively address challenges in reinforcement learning, such as handling large action spaces, designing appropriate reward functions, and dealing with stochastic environments, while considering the use of different algorithms like DDPG, SARSA, and policy networks, and ensuring efficient training and evaluation processes?', 'How do evolutionary algorithms like NEAT compare to traditional reinforcement learning algorithms such as PPO or SAC, and what are the considerations for using them in scenarios involving multi-objective optimization, unknown search spaces, or dynamic data streams?', 'How do various neural network architectures, such as Transformers, CNNs, and RNNs, handle specific design choices like intermediate dense layers, positional encodings, and input size variations, and what are the implications of these choices on model performance and implementation?']\n",
      "No previous results\n",
      "10\n",
      "[2022, 0, 50, 3]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [06:32<00:00, 130.81s/it]\n",
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [06:13<00:00, 124.61s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['How can I address issues with training stability, loss function selection, and gradient problems in neural networks, while ensuring proper initialization, normalization, and understanding of activation functions and their derivatives?', 'How can I effectively preprocess and encode various types of data (e.g., time series, categorical, quantitative) for input into neural networks, and what are the best practices for handling different data types, such as embedding categorical variables, normalizing quantitative data, and dealing with high-dimensional or complex data structures like tensors or graphs?', 'How do language models like ChatGPT understand and process context, commands, and novel instructions without explicit programming logic, and how can they be fine-tuned or adapted for specific tasks such as question answering with multiple contexts or maintaining topic relevance in generated responses?', 'How can reinforcement learning algorithms be adapted or optimized for environments with large or infinite state spaces, continuous action spaces, or complex dynamics, such as multi-agent systems, games with imperfect information, or tasks requiring specific reward structures and exploration strategies?', 'How can I effectively evaluate and improve the performance of machine learning models, considering issues like overfitting, data imbalance, and model selection, while also ensuring that the training, validation, and test datasets are appropriately split and representative of the overall data distribution?', 'How can I choose the most suitable machine learning model or algorithm for a specific task, such as optimizing combinatorial problems or evaluating reinforcement learning libraries, and what resources or criteria can guide this decision-making process?', 'What are the current advantages and applications of reservoir computers compared to recurrent neural networks or transformers, given the advancements in computational power over the past two decades?', 
'What are the key differences between Neuro-Symbolic AI and Transformer AI, and how do these paradigms relate to the development of human-level intelligence (AGI), considering the limitations of current AI models in terms of Turing completeness, ethical concerns, and practical applications?', 'What are the best practices and models for handling time-varying data in machine learning, particularly for tasks like time series forecasting, anomaly detection, and adapting to data drift, while considering challenges such as limited data, feature changes over time, and the need for online learning or continual adaptation?', 'How can I effectively use machine learning models, particularly in computer vision tasks, to handle challenges such as image resizing, feature extraction, dataset preparation, and model selection, while considering issues like class imbalance, limited data, and the need for specific algorithms or architectures for tasks like object detection, image classification, and anomaly detection?']\n",
      "No previous results\n",
      "10\n",
      "[2023, 0, 50, 3]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [05:50<00:00, 116.99s/it]\n",
      "Iterating KMeans: 100%|█████████████████████████████████████████████████████████████████| 3/3 [06:25<00:00, 128.51s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['How can the Dirac delta function be understood as a special case of a probability density distribution with zero variance, and how does this relate to probabilistic reasoning over datasets, particularly in the context of reinforcement learning, diffusion models, and other machine learning frameworks?', 'How can beginners in machine learning understand the differences between key concepts like models, algorithms, and hypotheses, while also exploring the potential and limitations of AI in solving complex problems, such as the P-NP problem, and addressing challenges like adversarial attacks, hallucinations, and ethical considerations in AI applications?', 'How can I effectively modify and optimize machine learning models, such as WGAN-gp for sparse data or LSTM for time series, while addressing issues like training stability, hyperparameter tuning, and resource management, to improve model performance and scalability across different tasks and datasets?', 'Why do transformer models often use an encoder-decoder architecture when a decoder-only model seems capable of performing similar tasks, and how do these architectures handle embeddings, attention mechanisms, and tasks like language translation or next-word prediction differently?', 'How can I effectively utilize large language models (LLMs) for various tasks, such as generating variable-length lists, ensuring accurate and contextually relevant responses, integrating with external data sources, and optimizing performance, while addressing challenges like prompt design, model fine-tuning, and resource constraints?', 'How can neural network architectures be effectively designed and optimized to handle various input types, dimensions, and complexities, such as different input sizes, multiple input channels, and tasks like semantic segmentation, while considering factors like layer initialization, activation functions, and quantization?', 'How do different concepts such as delta, gradient, and error relate to 
each other in the context of machine learning, particularly in neural networks, and how do they differ in terms of their roles and definitions?', 'How can reinforcement learning algorithms effectively address challenges such as exploration-exploitation trade-offs, partial observability, and sparse or deceptive rewards, while considering the differences in architectures like DQN, PPO, and policy optimization, and the role of state representation, feature scaling, and memory in environments with complex dynamics and uncontrollable variables?', 'How can I effectively control the number of records returned in a similarity search using vector stores like Pinecone or Faiss, and what are the best practices for handling multi-class classification challenges, optimizing k-NN algorithm parameters, and selecting appropriate machine learning algorithms for various tasks such as detecting significant changes in functions or clustering semantic data?', 'What are some effective strategies and considerations for selecting and implementing machine learning models and techniques for various image processing tasks, such as transfer learning, object detection, image segmentation, and adversarial robustness, while addressing challenges like dataset size, model efficiency, and specific application requirements?']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Run configuration shared by every year processed below.\n",
    "max_iter = 120\n",
    "emb_type = 'openai'\n",
    "max_batch_size = 1000\n",
    "num_clusters = 10\n",
    "\n",
    "# Prompt and text label passed to miniBatchKLLMeans for the cluster summaries.\n",
    "prompt = \"The following is a cluster of questions from the AI community. Write a single question that represents the cluster of questions:\"\n",
    "text_type = \"Question:\"\n",
    "\n",
    "for year in [2020,2021,2022,2023]:\n",
    "\n",
    "    # One results file per year; it is both the resume cache and the final output.\n",
    "    results_path = \"results/sims_stackexchange_results_AI_q_\" + str(year) + \".pkl\"\n",
    "\n",
    "    with open(\"processed_data/data_stackexchange_openai_\" + str(year) + \".pkl\", \"rb\") as f:\n",
    "        data_dict = pickle.load(f)\n",
    "\n",
    "    # Resume from a previous run when a readable results file exists. Only the\n",
    "    # missing/truncated/corrupt cache cases are treated as 'no previous results';\n",
    "    # any other error (including KeyboardInterrupt) propagates instead of being\n",
    "    # silently swallowed.\n",
    "    try:\n",
    "        with open(results_path, \"rb\") as f:\n",
    "            results_dict = pickle.load(f)\n",
    "        print('Old results_dict loaded')\n",
    "    except (FileNotFoundError, EOFError, pickle.UnpicklingError):\n",
    "        print('No previous results')\n",
    "        results_dict = {}\n",
    "\n",
    "    # Store the embeddings as a column first so they stay aligned with their rows\n",
    "    # through the label filter and the sort below.\n",
    "    data_dict['data']['embeddings'] = list(data_dict['embeddings'])\n",
    "    data_dict['data'] = data_dict['data'][data_dict['data']['Label']=='ai'].copy()\n",
    "    data_dict['data'] = data_dict['data'].sort_values('CreationDate')\n",
    "\n",
    "    text_data = list(data_dict['data']['Text'])\n",
    "    text_features = list(data_dict['data']['embeddings'])\n",
    "\n",
    "    print(num_clusters)\n",
    "\n",
    "    # results_dict is nested as results_dict[seed][force_context_length][max_llm_iter].\n",
    "    for seed in range(1):\n",
    "        if results_dict.get(seed) is None:\n",
    "            results_dict[seed] = {}\n",
    "\n",
    "        #kLLMmeans\n",
    "        for force_context_length in [50]:\n",
    "            if results_dict[seed].get(force_context_length) is None:\n",
    "                results_dict[seed][force_context_length] = {}\n",
    "\n",
    "            for max_llm_iter in [3]:\n",
    "                print([year, seed, force_context_length, max_llm_iter])\n",
    "\n",
    "                if results_dict[seed][force_context_length].get(max_llm_iter) is None:\n",
    "                    summaries, centroids = miniBatchKLLMeans(\n",
    "                        text_data,\n",
    "                        num_clusters,\n",
    "                        max_batch_size = max_batch_size,\n",
    "                        init = 'k-means++',\n",
    "                        prompt = prompt, text_type = text_type,\n",
    "                        force_context_length = force_context_length, max_llm_iter = max_llm_iter,\n",
    "                        max_iter = max_iter, tol=1e-4, random_state = seed,\n",
    "                        emb_type = emb_type, text_features = text_features,\n",
    "                        final_iter = True,\n",
    "                        initial_iter = True)\n",
    "\n",
    "                    # Label every document by running KMeans seeded with the returned centroids.\n",
    "                    # NOTE(review): with max_iter=1 KMeans may still move the centers once before\n",
    "                    # labelling, so the assignments might not be nearest-to-`centroids` exactly -\n",
    "                    # confirm this is intended.\n",
    "                    kmeans2 = KMeans(n_clusters=num_clusters, init=centroids, max_iter=1)\n",
    "                    cluster_assignments = kmeans2.fit_predict(text_features)\n",
    "\n",
    "                    data_results ={'assignments':cluster_assignments,\n",
    "                                   'summaries':summaries,\n",
    "                                   'final_centroids':centroids}\n",
    "\n",
    "                    print(summaries[-1])\n",
    "                    results_dict[seed][force_context_length][max_llm_iter] = data_results\n",
    "\n",
    "                    # Save after each configuration so an interrupted run can resume.\n",
    "                    with open(results_path, \"wb\") as f:\n",
    "                        pickle.dump(results_dict, f)\n",
    "\n",
    "                else:\n",
    "                    # Cached run: show the same final summaries a fresh run prints. These live\n",
    "                    # in the stored results, not in the loaded dataset dict.\n",
    "                    data_results = results_dict[seed][force_context_length][max_llm_iter]\n",
    "                    print(data_results['summaries'][-1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
