{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "594ab3b4-d849-48a5-abbc-80b0197af892",
   "metadata": {},
   "source": [
    "## HPC Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da753fa3-015e-4f93-9971-c097ebb08202",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import vt\n",
    "import time\n",
    "import asyncio\n",
    "import nest_asyncio\n",
    "import hashlib\n",
    "import logging\n",
    "import aiohttp\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "# Apply nest_asyncio to allow re-entering the event loop\n",
    "nest_asyncio.apply()\n",
    "\n",
    "# Set up logging\n",
    "log_file = 'benign_scan_process.log'\n",
    "logging.basicConfig(\n",
    "    filename=log_file,\n",
    "    filemode='a',\n",
    "    format='%(asctime)s - %(levelname)s - %(message)s',\n",
    "    level=logging.INFO\n",
    ")\n",
    "\n",
    "# VirusTotal API key\n",
    "API_KEY = '0cb4de4eb946dbc69fd11f845adca6ed011f86451cf2ca2e69ffd464478026a2'  # Replace with your actual VirusTotal API key\n",
    "\n",
    "# Folder containing extracted files from the provided directory\n",
    "extracted_folder = 'Benign_Dataset'\n",
    "\n",
    "# CSV file to save results progressively\n",
    "csv_file = 'benign_scan_results.csv'\n",
    "\n",
    "# Initialize lists to store results and package information\n",
    "results = []\n",
    "api_request_count = 0\n",
    "daily_request_limit = 20000  # VirusTotal daily limit\n",
    "start_of_day = datetime.now().date()\n",
    "\n",
    "# Function to wait until the next day at midnight\n",
    "def wait_until_next_day():\n",
    "    now = datetime.now()\n",
    "    next_day = (now + timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "    wait_seconds = (next_day - now).total_seconds()\n",
    "    logging.info(f\"Daily limit reached. Waiting until midnight to resume. Waiting {wait_seconds / 3600:.2f} hours.\")\n",
    "    time.sleep(wait_seconds)\n",
    "\n",
    "# Function to recursively get all files in directory\n",
    "def get_all_files(directory):\n",
    "    file_paths = []\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            file_paths.append(os.path.join(root, file))\n",
    "    return file_paths\n",
    "\n",
    "# Function to compute SHA-256 hash of a file\n",
    "def compute_sha256(file_path):\n",
    "    sha256_hash = hashlib.sha256()\n",
    "    with open(file_path, \"rb\") as f:\n",
    "        for byte_block in iter(lambda: f.read(4096), b\"\"):\n",
    "            sha256_hash.update(byte_block)\n",
    "    return sha256_hash.hexdigest()\n",
    "\n",
    "# Function to save results to CSV progressively\n",
    "def save_results():\n",
    "    df = pd.DataFrame(results)\n",
    "    df.to_csv(csv_file, mode='a', header=not os.path.exists(csv_file), index=False)\n",
    "    results.clear()  # Clear results after saving to avoid duplicates\n",
    "    logging.info(\"Results saved to CSV.\")\n",
    "\n",
    "# Get all files in the extracted folder\n",
    "all_files = get_all_files(extracted_folder)\n",
    "\n",
    "# Asynchronous function to scan files with retry logic for API errors\n",
    "async def scan_files(all_files, start_index=0):\n",
    "    global api_request_count, start_of_day\n",
    "    conn = aiohttp.TCPConnector(ssl=False)  # Disable SSL verification\n",
    "    async with vt.Client(API_KEY, connector=conn) as client:  # Use custom connector\n",
    "        for i, file_path in enumerate(all_files[start_index:], start=start_index):\n",
    "            # Check if the daily limit is reached\n",
    "            if api_request_count >= daily_request_limit:\n",
    "                wait_until_next_day()  # Wait until the next day at midnight\n",
    "                api_request_count = 0  # Reset the request count\n",
    "                start_of_day = datetime.now().date()  # Reset the day start\n",
    "\n",
    "            try:\n",
    "                # Calculate the SHA-256 hash of the file\n",
    "                file_hash = compute_sha256(file_path)\n",
    "                logging.info(f\"Starting scan for {file_path}\")\n",
    "\n",
    "                # Retry logic for API errors with exponential backoff\n",
    "                retries = 3\n",
    "                wait_time = 30\n",
    "                for attempt in range(retries):\n",
    "                    try:\n",
    "                        with open(file_path, 'rb') as f:\n",
    "                            analysis = await client.scan_file_async(f)\n",
    "                        api_request_count += 1  # Increment the request count\n",
    "                        logging.info(f\"Scan started for {file_path}, Request count: {api_request_count}\")\n",
    "                        break  # Exit loop if scan is successful\n",
    "                    except vt.error.APIError:\n",
    "                        logging.error(f\"API error on attempt {attempt + 1} for {file_path}. Retrying in {wait_time} seconds...\")\n",
    "                        time.sleep(wait_time)\n",
    "                        wait_time *= 2  # Exponential backoff\n",
    "                else:\n",
    "                    logging.error(f\"Max retries reached for {file_path}. Marking as 'APIError'.\")\n",
    "                    results.append({\n",
    "                        'file_path': file_path,\n",
    "                        'malicious_count': 'APIError',\n",
    "                        'suspicious_count': '',\n",
    "                        'undetected_count': '',\n",
    "                        'harmless_count': '',\n",
    "                        'popular_threat_name': '',\n",
    "                        'popular_threat_category': '',\n",
    "                        'suggested_threat_label': ''\n",
    "                    })\n",
    "                    continue  # Move to the next file if max retries reached\n",
    "\n",
    "                # Wait for the analysis to complete using the file's hash\n",
    "                analysis = await client.get_object_async(f\"/analyses/{analysis.id}\")\n",
    "                while analysis.status == 'queued':\n",
    "                    logging.info(f\"Waiting for analysis to complete for {file_path}...\")\n",
    "                    time.sleep(30)\n",
    "                    analysis = await client.get_object_async(f\"/analyses/{analysis.id}\")\n",
    "                \n",
    "                # Retrieve the report using the computed hash\n",
    "                file_report = await client.get_object_async(f\"/files/{file_hash}\")\n",
    "                api_request_count += 1  # Increment request count for the retrieval\n",
    "                                \n",
    "                # Extract relevant data\n",
    "                last_analysis_stats = file_report.last_analysis_stats if 'last_analysis_stats' in dir(file_report) else {}\n",
    "                malicious_count = last_analysis_stats.get('malicious', 0)\n",
    "                suspicious_count = last_analysis_stats.get('suspicious', 0)\n",
    "                undetected_count = last_analysis_stats.get('undetected', 0)\n",
    "                harmless_count = last_analysis_stats.get('harmless', 0)\n",
    "\n",
    "                # Check if 'popular_threat_classification' is present\n",
    "                if hasattr(file_report, 'popular_threat_classification'):\n",
    "                    popular_threat_name = ', '.join([threat['value'] for threat in file_report.popular_threat_classification.get('popular_threat_name', [])])\n",
    "                    popular_threat_category = ', '.join([cat['value'] for cat in file_report.popular_threat_classification.get('popular_threat_category', [])])\n",
    "                    suggested_threat_label = file_report.popular_threat_classification.get('suggested_threat_label', '')\n",
    "                else:\n",
    "                    popular_threat_name = ''\n",
    "                    popular_threat_category = ''\n",
    "                    suggested_threat_label = ''\n",
    "\n",
    "                # Append the result\n",
    "                results.append({\n",
    "                    'file_path': file_path,\n",
    "                    'malicious_count': malicious_count,\n",
    "                    'suspicious_count': suspicious_count,\n",
    "                    'undetected_count': undetected_count,\n",
    "                    'harmless_count': harmless_count,\n",
    "                    'popular_threat_name': popular_threat_name,\n",
    "                    'popular_threat_category': popular_threat_category,\n",
    "                    'suggested_threat_label': suggested_threat_label\n",
    "                })\n",
    "                \n",
    "                # Log the successful processing of the file\n",
    "                logging.info(f\"Processed file {file_path}\")\n",
    "\n",
    "                # Save progress every 5 files\n",
    "                if (i + 1) % 5 == 0:\n",
    "                    logging.info(f\"Saving results for files processed up to index {i}...\")\n",
    "                    save_results()\n",
    "                \n",
    "                # Respect the API rate limit\n",
    "                time.sleep(15)\n",
    "\n",
    "            except Exception as e:\n",
    "                # Log the error\n",
    "                logging.error(f\"Error processing {file_path}: {e}\")\n",
    "                results.append({\n",
    "                    'file_path': file_path,\n",
    "                    'malicious_count': 'Error',\n",
    "                    'suspicious_count': '',\n",
    "                    'undetected_count': '',\n",
    "                    'harmless_count': '',\n",
    "                    'popular_threat_name': '',\n",
    "                    'popular_threat_category': '',\n",
    "                    'suggested_threat_label': ''\n",
    "                })\n",
    "\n",
    "        # Final save for remaining results after the loop ends\n",
    "        if results:\n",
    "            save_results()\n",
    "            logging.info(\"Final results saved after processing all files.\")\n",
    "\n",
    "# Run the async function to scan files starting from 0\n",
    "loop = asyncio.get_event_loop()\n",
    "loop.run_until_complete(scan_files(all_files, start_index=0))\n",
    "\n",
    "print(\"Scanning complete. Results saved to scan_results.csv. Log saved to scan_process.log.\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
