{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "618d8e48-ccbe-4708-b863-544204062393",
   "metadata": {},
   "source": [
    "## Save Only Vendor Information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b7d68e9-576e-4168-8e92-16ac90360220",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import vt\n",
    "import time\n",
    "import asyncio\n",
    "import nest_asyncio\n",
    "import hashlib\n",
    "import zipfile\n",
    "import logging\n",
    "import aiohttp\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "# Apply nest_asyncio to allow re-entering the event loop\n",
    "nest_asyncio.apply()\n",
    "\n",
    "# Set up logging\n",
    "log_file = 'scan_process.log'\n",
    "logging.basicConfig(\n",
    "    filename=log_file,\n",
    "    filemode='a',\n",
    "    format='%(asctime)s - %(levelname)s - %(message)s',\n",
    "    level=logging.INFO\n",
    ")\n",
    "\n",
    "# Your VirusTotal API key\n",
    "API_KEY = '0cb4de4eb946dbc69fd11f845adca6ed011f86451cf2ca2e69ffd464478026a2'  # Replace with your actual VirusTotal API key\n",
    "\n",
    "# Folder containing extracted files from the provided zip\n",
    "extracted_folder = 'Dataset/data'\n",
    "\n",
    "# CSV file to save results progressively\n",
    "csv_file = 'scan_results.csv'\n",
    "\n",
    "# Path to the zip file\n",
    "zip_file_path = 'Dataset/pypi_malregistry.zip'\n",
    "\n",
    "# Initialize lists to store results and package information\n",
    "results = []\n",
    "package_info = []\n",
    "api_request_count = 0\n",
    "daily_request_limit = 20000  # VirusTotal daily limit\n",
    "start_of_day = datetime.now().date()\n",
    "\n",
    "# Function to wait until the next day at midnight\n",
    "def wait_until_next_day():\n",
    "    now = datetime.now()\n",
    "    next_day = (now + timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)\n",
    "    wait_seconds = (next_day - now).total_seconds()\n",
    "    logging.info(f\"Daily limit reached. Waiting until midnight to resume. Waiting {wait_seconds / 3600:.2f} hours.\")\n",
    "    time.sleep(wait_seconds)\n",
    "\n",
    "# Function to recursively get all files in directory\n",
    "def get_all_files(directory):\n",
    "    file_paths = []\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            file_paths.append(os.path.join(root, file))\n",
    "    return file_paths\n",
    "\n",
    "# Function to compute SHA-256 hash of a file\n",
    "def compute_sha256(file_path):\n",
    "    sha256_hash = hashlib.sha256()\n",
    "    with open(file_path, \"rb\") as f:\n",
    "        for byte_block in iter(lambda: f.read(4096), b\"\"):\n",
    "            sha256_hash.update(byte_block)\n",
    "    return sha256_hash.hexdigest()\n",
    "\n",
    "# Function to save results to CSV progressively\n",
    "def save_results():\n",
    "    df = pd.DataFrame(results)\n",
    "    df.to_csv(csv_file, mode='a', header=not os.path.exists(csv_file), index=False)\n",
    "    results.clear()  # Clear results after saving to avoid duplicates\n",
    "\n",
    "# Extract package names and versions from zip file\n",
    "with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n",
    "    for file in zip_ref.namelist():\n",
    "        if file.endswith('.tar.gz'):\n",
    "            parts = file.split('/')\n",
    "            if len(parts) >= 2:\n",
    "                package_name = parts[-3]\n",
    "                version = parts[-2].replace('.tar.gz', '')\n",
    "                package_info.append((package_name, version))\n",
    "\n",
    "# Sort package information in ascending order by package name\n",
    "package_info = sorted(package_info, key=lambda x: x[0].lower())\n",
    "\n",
    "# Map all files in extracted folder to package_info order\n",
    "all_files = get_all_files(extracted_folder)\n",
    "sorted_files = []\n",
    "for package_name, version in package_info:\n",
    "    for file_path in all_files:\n",
    "        if package_name in file_path and version in file_path:\n",
    "            sorted_files.append((file_path, package_name, version))\n",
    "            break\n",
    "\n",
    "# Asynchronous function to scan files with retry logic for API errors\n",
    "async def scan_files(sorted_files, start_index=0):\n",
    "    global api_request_count, start_of_day\n",
    "    conn = aiohttp.TCPConnector(ssl=False)  # Disable SSL verification\n",
    "    async with vt.Client(API_KEY, connector=conn) as client:  # Use custom connector\n",
    "        for i, (file_path, package_name, version) in enumerate(sorted_files[start_index:], start=start_index):\n",
    "            # Check if the daily limit is reached\n",
    "            if api_request_count >= daily_request_limit:\n",
    "                wait_until_next_day()  # Wait until the next day at midnight\n",
    "                api_request_count = 0  # Reset the request count\n",
    "                start_of_day = datetime.now().date()  # Reset the day start\n",
    "\n",
    "            try:\n",
    "                # Calculate the SHA-256 hash of the file\n",
    "                file_hash = compute_sha256(file_path)\n",
    "                \n",
    "                # Retry logic for API errors with exponential backoff\n",
    "                retries = 3\n",
    "                wait_time = 30\n",
    "                for attempt in range(retries):\n",
    "                    try:\n",
    "                        with open(file_path, 'rb') as f:\n",
    "                            analysis = await client.scan_file_async(f)\n",
    "                        api_request_count += 1  # Increment the request count\n",
    "                        break  # Exit loop if scan is successful\n",
    "                    except vt.error.APIError:\n",
    "                        logging.error(f\"API error on attempt {attempt + 1} for {file_path}. Retrying in {wait_time} seconds...\")\n",
    "                        time.sleep(wait_time)\n",
    "                        wait_time *= 2  # Exponential backoff\n",
    "                else:\n",
    "                    # If retries exhausted, log and mark as failed\n",
    "                    logging.error(f\"Max retries reached for {file_path}. Marking as 'APIError'.\")\n",
    "                    results.append({\n",
    "                        'file_path': file_path,\n",
    "                        'package_name': package_name,\n",
    "                        'version': version,\n",
    "                        'malicious_count': 'APIError',\n",
    "                        'suspicious_count': '',\n",
    "                        'undetected_count': '',\n",
    "                        'harmless_count': '',\n",
    "                        'popular_threat_name': '',\n",
    "                        'popular_threat_category': '',\n",
    "                        'suggested_threat_label': ''\n",
    "                    })\n",
    "                    continue  # Move to the next file if max retries reached\n",
    "\n",
    "                # Wait for the analysis to complete using the file's hash\n",
    "                analysis = await client.get_object_async(f\"/analyses/{analysis.id}\")\n",
    "                while analysis.status == 'queued':\n",
    "                    logging.info(f\"Waiting for analysis to complete for {file_path}...\")\n",
    "                    time.sleep(30)\n",
    "                    analysis = await client.get_object_async(f\"/analyses/{analysis.id}\")\n",
    "                \n",
    "                # Retrieve the report using the computed hash\n",
    "                file_report = await client.get_object_async(f\"/files/{file_hash}\")\n",
    "                api_request_count += 1  # Increment request count for the retrieval\n",
    "                                \n",
    "                # Extract relevant data\n",
    "                last_analysis_stats = file_report.last_analysis_stats if 'last_analysis_stats' in dir(file_report) else {}\n",
    "                malicious_count = last_analysis_stats.get('malicious', 0)\n",
    "                suspicious_count = last_analysis_stats.get('suspicious', 0)\n",
    "                undetected_count = last_analysis_stats.get('undetected', 0)\n",
    "                harmless_count = last_analysis_stats.get('harmless', 0)\n",
    "\n",
    "                # Check if 'popular_threat_classification' is present\n",
    "                if hasattr(file_report, 'popular_threat_classification'):\n",
    "                    popular_threat_name = ', '.join([threat['value'] for threat in file_report.popular_threat_classification.get('popular_threat_name', [])])\n",
    "                    popular_threat_category = ', '.join([cat['value'] for cat in file_report.popular_threat_classification.get('popular_threat_category', [])])\n",
    "                    suggested_threat_label = file_report.popular_threat_classification.get('suggested_threat_label', '')\n",
    "                else:\n",
    "                    popular_threat_name = ''\n",
    "                    popular_threat_category = ''\n",
    "                    suggested_threat_label = ''\n",
    "\n",
    "                # Append the result\n",
    "                results.append({\n",
    "                    'file_path': file_path,\n",
    "                    'package_name': package_name,\n",
    "                    'version': version,\n",
    "                    'malicious_count': malicious_count,\n",
    "                    'suspicious_count': suspicious_count,\n",
    "                    'undetected_count': undetected_count,\n",
    "                    'harmless_count': harmless_count,\n",
    "                    'popular_threat_name': popular_threat_name,\n",
    "                    'popular_threat_category': popular_threat_category,\n",
    "                    'suggested_threat_label': suggested_threat_label\n",
    "                })\n",
    "                \n",
    "                # Log the successful processing of the file\n",
    "                logging.info(f\"Processed file {file_path} - Package: {package_name}, Version: {version}\")\n",
    "\n",
    "                # Save progress every 5 files\n",
    "                if (i + 1) % 5 == 0:\n",
    "                    logging.info(f\"Saving results for files processed up to index {i}...\")\n",
    "                    save_results()\n",
    "                \n",
    "                # Respect the API rate limit\n",
    "                time.sleep(15)\n",
    "\n",
    "            except Exception as e:\n",
    "                # Log the error\n",
    "                logging.error(f\"Error processing {file_path} - Package: {package_name}, Version: {version}: {e}\")\n",
    "                results.append({\n",
    "                    'file_path': file_path,\n",
    "                    'package_name': package_name,\n",
    "                    'version': version,\n",
    "                    'malicious_count': 'Error',\n",
    "                    'suspicious_count': '',\n",
    "                    'undetected_count': '',\n",
    "                    'harmless_count': '',\n",
    "                    'popular_threat_name': '',\n",
    "                    'popular_threat_category': '',\n",
    "                    'suggested_threat_label': ''\n",
    "                })\n",
    "\n",
    "        # Final save for remaining results after the loop ends\n",
    "        if results:\n",
    "            save_results()\n",
    "            logging.info(\"Final results saved after processing all files.\")\n",
    "\n",
    "# Run the async function to scan files starting from 0\n",
    "loop = asyncio.get_event_loop()\n",
    "loop.run_until_complete(scan_files(sorted_files, start_index=0))\n",
    "\n",
    "print(\"Scanning complete. Results saved to scan_results.csv. Log saved to scan_process.log.\")"
   ]
  },
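  {
   "cell_type": "markdown",
   "id": "b4c1f3a2-7d5e-4c88-9f10-2a6e4d8b9c31",
   "metadata": {},
   "source": [
    "Since every file is hashed before upload, a quota-friendly variant is possible: query VirusTotal for an existing report via `/files/{sha256}` first, and only upload when the lookup fails with `NotFoundError`. The sketch below is illustrative rather than the workflow used above; `check_or_scan` is a hypothetical helper that reuses `compute_sha256` and the `vt.Client` methods from the previous cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9e2d7f4-1a3b-4e65-8d02-5f7b9a0c1e46",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical hash-first helper (sketch): look up an existing report before uploading.\n",
    "async def check_or_scan(client, file_path):\n",
    "    file_hash = compute_sha256(file_path)\n",
    "    try:\n",
    "        # A hit means VirusTotal already analyzed this file; no upload or wait is needed.\n",
    "        return await client.get_object_async(f\"/files/{file_hash}\")\n",
    "    except vt.error.APIError as e:\n",
    "        if e.code != 'NotFoundError':\n",
    "            raise  # Surface quota/auth errors instead of silently re-uploading\n",
    "    # Unknown file: upload it and let the caller poll the analysis as in the cell above.\n",
    "    with open(file_path, 'rb') as f:\n",
    "        analysis = await client.scan_file_async(f)\n",
    "    return analysis"
   ]
  },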
  {
   "cell_type": "markdown",
   "id": "9ff11c71-3222-485c-8d61-cd78b752f1ef",
   "metadata": {},
   "source": [
    "## Save all the  data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d37720f1-4ee6-46b2-ba4f-5f62ea52aec0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Waiting for analysis to complete for D:/Final Version/Dataset/new\\1inch\\8.6\\1inch-8.6.tar.gz...\n",
      "Waiting for analysis to complete for D:/Final Version/Dataset/new\\1inch\\8.6\\1inch-8.6.tar.gz...\n",
      "Waiting for analysis to complete for D:/Final Version/Dataset/new\\1inch\\8.6\\1inch-8.6.tar.gz...\n",
      "Waiting for analysis to complete for D:/Final Version/Dataset/new\\1inch\\8.6\\1inch-8.6.tar.gz...\n",
      "Waiting for analysis to complete for D:/Final Version/Dataset/new\\1inch\\8.6\\1inch-8.6.tar.gz...\n",
      "Waiting for analysis to complete for D:/Final Version/Dataset/new\\1inch\\8.6\\1inch-8.6.tar.gz...\n",
      "Scanning complete. Results saved to scan_results.csv and scan_results.json.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import vt\n",
    "import os\n",
    "import time\n",
    "import asyncio\n",
    "import nest_asyncio\n",
    "import hashlib\n",
    "import json\n",
    "\n",
    "# Apply nest_asyncio to allow re-entering the event loop\n",
    "nest_asyncio.apply()\n",
    "\n",
    "# Your VirusTotal API key\n",
    "API_KEY = '0cb4de4eb946dbc69fd11f845adca6ed011f86451cf2ca2e69ffd464478026a2'\n",
    "\n",
    "# Folder containing extracted files from the provided zip\n",
    "extracted_folder = 'D:/Final Version/Dataset/new'\n",
    "\n",
    "# List to store the results\n",
    "results = []\n",
    "\n",
    "# Function to recursively get all files in directory\n",
    "def get_all_files(directory):\n",
    "    file_paths = []\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            file_paths.append(os.path.join(root, file))\n",
    "    return file_paths\n",
    "\n",
    "# Function to compute SHA-256 hash of a file\n",
    "def compute_sha256(file_path):\n",
    "    sha256_hash = hashlib.sha256()\n",
    "    with open(file_path, \"rb\") as f:\n",
    "        for byte_block in iter(lambda: f.read(4096), b\"\"):\n",
    "            sha256_hash.update(byte_block)\n",
    "    return sha256_hash.hexdigest()\n",
    "\n",
    "# Asynchronous function to scan files\n",
    "async def scan_files(file_paths):\n",
    "    async with vt.Client(API_KEY) as client:\n",
    "        for file_path in file_paths:\n",
    "            try:\n",
    "                # Calculate the SHA-256 hash of the file\n",
    "                file_hash = compute_sha256(file_path)\n",
    "                \n",
    "                # Upload the file for scanning\n",
    "                with open(file_path, 'rb') as f:\n",
    "                    analysis = await client.scan_file_async(f)\n",
    "                \n",
    "                # Wait for the analysis to complete using the file's hash\n",
    "                analysis = await client.get_object_async(f\"/analyses/{analysis.id}\")\n",
    "                while analysis.status == 'queued':\n",
    "                    print(f\"Waiting for analysis to complete for {file_path}...\")\n",
    "                    time.sleep(30)\n",
    "                    analysis = await client.get_object_async(f\"/analyses/{analysis.id}\")\n",
    "                \n",
    "                # Retrieve the report using the computed hash\n",
    "                file_report = await client.get_object_async(f\"/files/{file_hash}\")\n",
    "                \n",
    "                # Convert the entire file report to a dictionary and store it\n",
    "                file_data = file_report.to_dict()\n",
    "                \n",
    "                # Append file path to the data\n",
    "                file_data['file_path'] = file_path\n",
    "                \n",
    "                # Add to results list\n",
    "                results.append(file_data)\n",
    "                \n",
    "                # Respect the API rate limit\n",
    "                time.sleep(15)\n",
    "\n",
    "            except Exception as e:\n",
    "                print(f\"Error processing {file_path}: {e}\")\n",
    "                results.append({\n",
    "                    'file_path': file_path,\n",
    "                    'error': str(e)\n",
    "                })\n",
    "\n",
    "# Get all files in the extracted folder\n",
    "all_files = get_all_files(extracted_folder)\n",
    "\n",
    "# Run the async function to scan files\n",
    "loop = asyncio.get_event_loop()\n",
    "loop.run_until_complete(scan_files(all_files))\n",
    "\n",
    "# Convert results to DataFrame and save to CSV and JSON\n",
    "results_df = pd.DataFrame(results)\n",
    "results_df.to_csv('scan_results.csv', index=False)\n",
    "results_df.to_json('scan_results.json', orient='records', lines=True)\n",
    "\n",
    "print(\"Scanning complete. Results saved to scan_results.csv and scan_results.json.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
