{
  "sims": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Tasks involve searching for specific locations (e.g., restaurants, hotels, parks) with filters like ratings, accessibility, or amenities.",
        "Navigation tasks require generating routes or directions between multiple points (e.g., cities, landmarks, addresses).",
        "Queries demand real-time or dynamic information (e.g., closing hours, traffic conditions, availability).",
        "Tasks include filtering results by temporal constraints (e.g., 'open now,' specific dates, operating hours).",
        "Users need to compare or prioritize results based on criteria like price, ratings, proximity, or user reviews.",
        "Tasks require interacting with location-based services (e.g., maps, GPS coordinates, zip codes) to refine searches.",
        "Queries involve multi-step actions (e.g., search + route planning + extracting details like reviews or pricing).",
        "Tasks focus on identifying points of interest (POIs) near landmarks or geographic features (e.g., museums, trails, airports).",
        "Users seek granular details about businesses or services (e.g., menu items, parking types, accessibility features).",
        "Tasks require parsing structured information (e.g., pricing tiers, reservation systems, transit schedules) from search results."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Tasks require searching for specific points of interest (e.g., restaurants, hotels, parking) with location constraints.",
        "Queries involve filtering results by user-defined criteria like ratings (e.g., \u22654.6 stars), price ranges, or accessibility features.",
        "Navigation tasks frequently include proximity-based searches (e.g., 'closest to,' 'within 2 miles of').",
        "Users seek operational details such as hours of operation, availability status, or real-time traffic conditions.",
        "Route planning between two or more geographic points is a common objective.",
        "Tasks demand extraction of structured information (e.g., reviews, amenities, pricing) from location profiles.",
        "Searches often combine multiple filters (e.g., 'open now + not 24 hours + moderate pricing').",
        "Users target niche amenities (e.g., EV charging, wheelchair accessibility, motorcycle parking).",
        "Tasks involve validating dynamic or contextual data (e.g., traffic delays, seasonal closures, real-time availability).",
        "Queries prioritize actionable outcomes like reservations, directions, or comparative analysis (e.g., 'cheapest,' 'highest rated')."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Tasks involve searching for specific business types (e.g., restaurants, hotels, parking facilities).",
        "Queries require filtering results by user-defined criteria (e.g., ratings, price range, accessibility features).",
        "Tasks demand route planning between explicit start and end locations.",
        "Users seek granular operational details (e.g., hours of operation, review summaries, ticket prices).",
        "Geographic specificity is critical (e.g., zip codes, landmarks, city neighborhoods).",
        "Time-sensitive constraints appear in searches (e.g., 'closes at night,' 'availability on January 11th').",
        "Multi-step navigation is required (e.g., plan trip \u2192 check route \u2192 verify amenities).",
        "User preferences drive queries (e.g., cuisine type, budget tiers, accessibility needs).",
        "Tasks involve cross-referencing multiple attributes (e.g., 'EV charging + proximity to museums').",
        "Results must include contextual metadata (e.g., real-time traffic, weather, amenity availability)."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Tasks require using search functionality to find specific locations or services",
        "Navigation involves retrieving directions between two or more points",
        "Tasks demand filtering results by criteria like ratings, price range, or amenities",
        "Queries involve location-based proximity searches (e.g. 'near X', 'within Y miles')",
        "Tasks require checking operating hours or time-sensitive availability",
        "Users need to access detailed place information including reviews and ratings",
        "Tasks involve comparing multiple options (e.g. hotels, restaurants, parking)",
        "Navigation includes looking up accessibility features (e.g. wheelchair accessibility)",
        "Tasks require interaction with booking/reservation systems or price comparisons",
        "Queries involve hybrid use of map exploration and text-based search refinement"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Tasks require searching for specific business types (e.g., restaurants, hotels, parking) with granular criteria (e.g., ratings, hours, proximity).",
        "Navigation tasks involve route planning between multiple points (e.g., addresses, landmarks) using varied transportation modes (e.g., driving, biking).",
        "Queries demand filtering by accessibility features (e.g., wheelchair access, EV charging).",
        "Users seek operational details like hours, availability status ('open now'), or seasonal closures.",
        "Tasks require parsing location-based metadata (e.g., zip codes, coordinates, 'near me' contexts).",
        "Multi-step actions are common: first locating a place, then retrieving auxiliary data (e.g., reviews, routes, pricing).",
        "Users prioritize rating thresholds (e.g., '4.6+ stars', 'highly-rated') as key filters.",
        "Tasks involve validating place-specific amenities (e.g., parking types, menus, reservation options).",
        "Queries frequently target geographic landmarks (e.g., museums, parks, airports) as reference points.",
        "Users cross-reference multiple data points (e.g., combining price range, cuisine, and accessibility for restaurants)."
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks involve searching for repositories with specific criteria (e.g., stars, programming language, update date).",
        "Tasks require locating product-specific pages (e.g., GitHub Copilot, Advanced Security, Pricing).",
        "Tasks include identifying customer stories or case studies (e.g., Duolingo, Mercedes-Benz).",
        "Tasks involve extracting technical details (e.g., features, contributors, commit history).",
        "Tasks require navigating GitHub\u2019s security features (e.g., vulnerabilities, Dependabot, secret scanning).",
        "Tasks include comparing or verifying pricing for GitHub products (e.g., Copilot, Codespaces).",
        "Tasks involve interacting with educational resources (e.g., GitHub Skills courses, documentation).",
        "Tasks require filtering or sorting repositories by popularity (e.g., trending, stars, forks).",
        "Tasks include validating account-related actions (e.g., sign-up, email verification).",
        "Tasks involve exploring compliance and policy details (e.g., terms of service, privacy policies)."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Tasks require searching/filtering repositories by criteria (stars, update dates, language, topic).",
        "Tasks involve navigating product feature descriptions (e.g. Copilot capabilities).",
        "Tasks require locating pricing/plan information for GitHub services.",
        "Tasks involve finding security-related content (advisories, Advanced Security features).",
        "Tasks require identifying customer success stories/case studies.",
        "Tasks involve API/documentation lookup (GraphQL, REST, GitHub Actions).",
        "Tasks require comparing different GitHub services/plans (Copilot vs Enterprise).",
        "Tasks involve locating legal/policy information (privacy, data usage, terms).",
        "Tasks require interacting with sign-up/authentication flows.",
        "Tasks involve finding open-source project metadata (contributors, licenses, activity)."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Tasks involve searching GitHub repositories using criteria like stars, dates, or programming languages.",
        "Navigation includes locating GitHub Copilot features, pricing, and security details.",
        "Tasks require accessing GitHub's product pages (e.g., Copilot, Advanced Security, Actions).",
        "Users interact with forms (e.g., sign-up, email validation) and authentication workflows.",
        "Tasks involve comparing GitHub plans (e.g., free vs. paid tiers, Copilot plans).",
        "Navigation includes finding customer stories or case studies from listed organizations.",
        "Tasks require parsing technical documentation (e.g., security policies, API usage, CI/CD workflows).",
        "Users explore GitHub educational resources (e.g., GitHub Skills, Classroom, courses).",
        "Tasks involve troubleshooting or interpreting error messages (e.g., push protection, action failures).",
        "Navigation targets GitHub's organizational structure (e.g., repositories, issues, pull requests, contributors)."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Tasks require searching/filtering repositories by criteria like stars, date ranges, and programming languages",
        "Tasks involve locating specific product features or documentation (e.g. Copilot capabilities, Advanced Security)",
        "Tasks require price comparison between different GitHub plans and Copilot tiers",
        "Tasks involve account management actions (sign-up, plan upgrades, trial requests)",
        "Tasks require navigation through GitHub's security features and vulnerability databases",
        "Tasks involve finding educational resources (GitHub Skills courses, documentation)",
        "Tasks require comparing enterprise vs individual plan features and limitations",
        "Tasks involve locating customer case studies and success stories",
        "Tasks require API/CLI integration information and developer tool documentation",
        "Tasks involve policy-related information retrieval (privacy, terms, data handling)"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Tasks involve locating repositories with specific criteria (language, stars, update recency)",
        "Navigation requires interaction with GitHub's search functionality and filtering parameters",
        "Tasks focus on extracting feature information from product pages (Copilot, Advanced Security)",
        "Both datasets require understanding GitHub's pricing structure and plan comparisons",
        "Tasks involve locating customer success stories and case studies",
        "Navigation paths require accessing security-related features (vulnerabilities, Dependabot)",
        "Tasks require identifying and interpreting technical documentation (APIs, project management tools)",
        "Both datasets include actions related to account management (sign-up, email verification)",
        "Tasks involve temporal filtering (last updated, creation date) for repository discovery",
        "Navigation requires understanding GitHub's organizational structure (Skills, Trending, Explore sections)"
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Tasks require navigating to specific sports sections (e.g., NBA, NFL, NCAA) for scoreboards and game results.",
        "Users frequently seek real-time or final scores of ongoing/completed games across multiple leagues.",
        "Tasks involve retrieving player-specific statistics (e.g., assists, points, career stats) from team/player profiles.",
        "Navigation includes accessing injury reports or roster updates for teams/players.",
        "Tasks require interaction with league standings, rankings, or playoff brackets (e.g., Eastern Conference standings).",
        "Users often locate news articles or updates about trades, MVP races, or team performance analyses.",
        "Tasks involve filtering schedules by date (e.g., 'yesterday's matchups') or team-specific calendars.",
        "Navigation includes search functionality to find teams/players by name or keyword (e.g., 'Los Angeles teams').",
        "Tasks require accessing fantasy sports tools (e.g., Tournament Challenge brackets, BPI rankings).",
        "Users interact with multimedia content (e.g., game highlights, ESPN+ streams, radio broadcasts) via embedded links."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Tasks involve navigating to sports scores and game results across multiple leagues (NBA, NFL, NCAA, etc.).",
        "Users frequently access detailed player statistics (e.g., points, assists, career games played).",
        "Navigation requires filtering content by specific leagues, teams, or sports categories.",
        "Tasks include retrieving real-time or final game outcomes and in-progress updates.",
        "Users interact with standings, rankings, and playoff/bracket information (e.g., NBA standings, NCAA brackets).",
        "Navigation involves accessing fantasy sports sections (e.g., Tournament Challenge, fantasy leagues).",
        "Tasks require locating team-specific information (rosters, injuries, depth charts).",
        "Users utilize search functionality to find specific games, players, or news articles.",
        "Navigation includes accessing schedules (e.g., NFL Week 17, college football bowl games).",
        "Tasks involve cross-referencing betting odds, game predictions, or broadcast details."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Both datasets involve retrieving real-time or recent game scores across multiple sports leagues (e.g., NBA, NFL, NCAA).",
        "Tasks require accessing detailed player/team statistics (e.g., points, assists, standings, career stats).",
        "Navigation includes locating league-specific sections (e.g., NBA standings, NCAA Football Scoreboard, EPL schedule).",
        "Users must identify team abbreviations and contextualize them within scores/game summaries (e.g., LAC, GS, NYR).",
        "Tasks frequently involve filtering information by date ranges or specific events (e.g., Week 18 NFL scores, Christmas Day games).",
        "Both datasets require accessing playoff/bracket information (e.g., CFP schedule, March Madness brackets).",
        "Users interact with ESPN+ features to find streaming schedules or exclusive content.",
        "Tasks involve comparing scores/stats across teams, players, or seasons (e.g., standings comparisons, MVP polls).",
        "Both include retrieving injury reports, roster details, or depth charts for specific teams.",
        "Navigation often requires synthesizing multi-step data (e.g., cross-referencing schedules with scores or player stats)."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Tasks involve retrieving real-time or recent sports scores and game results across leagues like NBA, NFL, NCAA, and NHL.",
        "Users navigate to player/team statistics pages to access career stats, game performance metrics, or seasonal data.",
        "Navigation includes accessing conference standings, division rankings, and playoff scenarios (e.g., AFC/NFC tiebreakers, NCAAF standings).",
        "Tasks require interaction with ESPN+ content, including live streams, original shows, or subscription-based features.",
        "Users search for news articles, trade updates, or injury reports within sport-specific sections (e.g., NBA trades, NFL free agency).",
        "Navigation paths include ESPN\u2019s 'Fantasy' section for sports analytics, rankings, or fantasy league tools.",
        "Tasks involve locating multimedia content such as podcasts, video highlights, or ESPN Radio broadcasts.",
        "Users utilize filtering or sorting to compare teams/players (e.g., rebounds leaders, heaviest infielders, roster comparisons).",
        "Navigation requires understanding ESPN\u2019s event categorization (e.g., bowl games, Final Four, Copa del Rey) and schedules.",
        "Tasks frequently involve cross-referencing multiple ESPN subdomains (e.g., ESPNFC for soccer, espnW for women\u2019s sports)."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Tasks require retrieving real-time or recent game scores across multiple sports leagues (e.g., NBA, NFL, NCAA).",
        "Users frequently access player-specific statistics (e.g., assists, points, career games played) from team/player profiles.",
        "Navigation involves league-specific sections (NBA, NHL, NFL, NCAA) with structured scoreboards and standings.",
        "Tasks demand locating team standings, rankings, and win-loss records within conferences/divisions.",
        "Both datasets include queries for injury reports and player availability statuses (e.g., Philadelphia 76ers' injuries).",
        "Users often check schedules for upcoming/past games, including dates, times, and TV broadcasts.",
        "Tasks require identifying tournament brackets (e.g., NCAA Final Four) and playoff scenarios across sports.",
        "Navigation includes accessing ESPN+ content summaries, streaming details, or subscription features.",
        "Users search for news/articles on trades, MVP candidates, and team strategies (e.g., NFL free agency updates).",
        "Interaction with game summaries (e.g., highlights, quarter-by-quarter scores) is essential for task completion."
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Tasks require locating models/datasets by name, modality, or application domain",
        "Tasks involve extracting metadata (e.g., update dates, download counts, metrics) from resource pages",
        "Tasks require navigating documentation for libraries like Transformers, PEFT, or Safetensors",
        "Tasks involve comparing/contrasting multiple models based on performance characteristics",
        "Tasks require interacting with API endpoints or inference examples",
        "Tasks involve verifying licensing information or usage restrictions",
        "Tasks require finding/using search functionality to discover resources",
        "Tasks involve following links between models, datasets, and related research papers",
        "Tasks require identifying appropriate model architectures for specific NLP/CV tasks",
        "Tasks involve navigating between organizational profiles and their contributed resources"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Tasks require locating specific machine learning models (e.g., NLP, text-to-image) with filters like modality, language, or update date",
        "Users must identify metadata attributes like licenses, download counts, or last-updated timestamps for models/datasets",
        "Tasks involve navigating documentation to find API usage instructions (e.g., Trainer class, PEFT adapters, inference endpoints)",
        "Price comparison and feature analysis of paid services (Pro accounts, GPU compute, enterprise solutions) are required",
        "Search functionality is critical for filtering models/datasets by name, task, or performance metrics",
        "Tasks require dataset exploration including content inspection and format verification through Dataset Viewer",
        "License compliance verification is needed for commercial/research use of models and datasets",
        "Performance benchmarking analysis is required through evaluation metrics reported in model cards",
        "Multi-modal support exploration is needed (text, image, 3D, audio) across different model categories",
        "Community interaction patterns emerge through forum navigation, space interactions, and model discussion participation"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Tasks require locating specific ML models by functionality (e.g., translation, sentiment analysis)",
        "Navigation involves filtering models/datasets by recency criteria (last updated within specific timeframes)",
        "Users must extract quantitative metrics from content (download counts, GitHub stars, model sizes)",
        "Tasks require interaction with technical documentation (APIs, libraries like PEFT/Transformers)",
        "Both involve finding enterprise-related information (pricing tiers, security features, support plans)",
        "Tasks necessitate identifying model architecture details (base models, training frameworks, precision formats)",
        "Navigation patterns include cross-referencing models with associated Spaces/demo applications",
        "Users must verify licensing information and usage restrictions for models/datasets",
        "Tasks require comparing performance metrics between similar models (evaluation scores, benchmarks)",
        "Both datasets involve community interaction features (model comments, dataset viewer analysis, Space interactions)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Tasks require navigating through models, datasets, and spaces sections to locate specific AI/ML resources.",
        "Users are expected to filter and identify resources based on criteria like modality (text, image, audio), update timestamps, and popularity metrics.",
        "Tasks involve extracting detailed metadata from model/dataset cards (e.g., download counts, last updated dates, evaluation metrics).",
        "Both datasets emphasize interacting with documentation to find API usage examples (e.g., Trainer API, PEFT adapters, diffusers).",
        "Tasks require comparing commercial vs. open-source offerings (e.g., Pro account features vs. community models).",
        "Users must handle multilingual content filtering (e.g., English/Japanese models, multilingual datasets).",
        "Tasks involve error resolution scenarios (e.g., 'Task not found' errors, deployment configuration issues).",
        "Both require identifying trending/newly released resources through timestamps and sorting mechanisms.",
        "Tasks demand cross-referencing between models, datasets, and associated Spaces demos/applications.",
        "Users must parse technical specifications like model architectures (Llama, Gemma), training frameworks (PyTorch, Keras), and quantization methods (4bit/8bit)."
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Tasks involve locating specific models/datasets by name or functionality (e.g., sentiment analysis, text generation)",
        "Navigation requires interaction with model/dataset metadata like update dates, download counts, and usage metrics",
        "Users must identify technical implementation details from documentation (e.g., Trainer API parameters, PEFT adapters)",
        "Tasks require comparing/ranking models by popularity metrics (download counts, trending status, GitHub stars)",
        "Activities involve parsing structured model cards with standardized attribute formats (creator, framework, license)",
        "Tasks require navigating between related resources (models \u2194 spaces \u2194 datasets \u2194 documentation)",
        "Users must extract specific numerical values from interface elements (pricing costs, model parameters, upvote counts)",
        "Activities involve temporal filtering (find models updated within specific date ranges)",
        "Tasks require understanding enterprise features vs community offerings (Pro accounts vs free tier capabilities)",
        "Navigation patterns involve recursive exploration (model \u2192 related papers \u2192 implementation \u2192 dataset)"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Tasks involve searching for courses by specific technical or business-related topics (e.g., data science, Python, AI, finance).",
        "Users filter courses by skill level granularity (e.g., beginner, intermediate) across both datasets.",
        "Tasks require extracting structured metadata like course duration, instructor names, and module-specific content details.",
        "Interest in career-aligned credentials (e.g., Professional Certificates from IBM/Google) appears in both datasets.",
        "Both include queries for university partnerships (e.g., Yale, University of Michigan, Stanford) and industry collaborations.",
        "Users compare course attributes like ratings (e.g., 4.7+), time commitments, and specialization components.",
        "Tasks target practical skill acquisition (e.g., Python programming, Excel, SQL) with tool-specific requirements.",
        "Queries involve validating course prerequisites, credit eligibility, and alignment with academic pathways.",
        "Both datasets include exploration of multi-course Specializations with sequenced learning objectives.",
        "Users seek verification of career outcomes (e.g., salary data, job availability metrics) tied to course completions."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Both datasets require users to filter courses by skill level (e.g., Beginner/Intermediate)",
        "Navigation tasks in both involve searching for courses with specific duration ranges (e.g., 1-4 weeks, 1-3 months)",
        "Both require identification of courses/programs from specific institutions (e.g., Yale, Google, IBM)",
        "Tasks in both datasets involve verifying course ratings (4+ stars) and enrollment statistics",
        "Both require extraction of detailed metadata including instructor names, course descriptions, and learning outcomes",
        "Navigation patterns involve checking credential types (Professional Certificates/Specializations/Degrees)",
        "Both require interaction with course prerequisites and recommended experience levels",
        "Tasks in both datasets involve comparing multiple course options within technical domains (Data Science, Cybersecurity, etc.)",
        "Both require identification of free vs. paid course offerings and associated value propositions",
        "Navigation flows in both datasets involve exploring career outcomes and job relevance of courses"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Both datasets require users to search/filter courses by specific skill levels (e.g., beginner/intermediate).",
        "Both include tasks involving identification of course duration parameters (e.g., weeks/hours to complete).",
        "Both require extraction of institution/company partnerships (e.g., universities from specific countries).",
        "Both involve finding courses with explicit rating thresholds (e.g., 4.7+ stars).",
        "Both require matching courses to technical skill requirements (e.g., Python, Agile, IoT).",
        "Both include tasks to identify credential types (Professional Certificates/Specializations) from specific providers.",
        "Both require verification of course content details like module structures or video counts.",
        "Both involve cross-referencing instructor information with multiple course offerings.",
        "Both require identification of free course offerings within catalog sections.",
        "Both include tasks to compare/contrast program formats (e.g., Degrees vs. Certificates vs. individual courses)"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Both datasets require users to filter/search courses by skill level (beginner/intermediate/advanced)",
        "Tasks in both datasets involve identifying courses with specific duration ranges (e.g., 1-4 weeks, 1-3 months)",
        "Both require understanding of course structure components (modules, videos, assessments)",
        "Users need to compare credentials across multiple providers (Google, IBM, universities) in both datasets",
        "Both involve extracting detailed metadata: ratings, enrollment numbers, and completion times",
        "Tasks require navigation through hierarchical taxonomies (Subject > Specialization > Course > Module)",
        "Both datasets contain queries about instructor credentials and institutional affiliations",
        "Users must interpret career outcome data (salary figures, job availability statistics) in both datasets",
        "Both require handling multiple course formats (Projects, Specializations, Professional Certificates, Degrees)",
        "Tasks involve cross-referencing partner institutions with geographic locations/countries"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Tasks require filtering courses/programs by skill level (Beginner/Intermediate)",
        "Navigation involves searching for specific course durations (e.g. 1-4 weeks, 1-3 months)",
        "Users frequently seek professional certificates from recognized providers (Google/IBM/Meta)",
        "Queries demand identification of institution partnerships (e.g. Yale, Imperial College London)",
        "Tasks require extracting detailed metadata: instructor names, ratings (4+ stars), and enrollment stats",
        "Both datasets emphasize career-oriented credentials with salary/job availability metrics",
        "Navigation patterns include exploring free course sections and 100% free filters",
        "Users compare degree programs with progress tracking features (credit eligibility)",
        "Tasks involve verifying AI/ML integration in course content and skill development",
        "Both require parsing multi-step program structures (Specializations \u2192 Courses \u2192 Projects)"
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Tasks involve searching for papers using specific categories/subfields (e.g., Astrophysics of Galaxies, Chaotic Dynamics)",
        "Users need to filter/search by submission dates (e.g., 'last day', 'within last week')",
        "Navigation requires interacting with hierarchical subject lists (e.g., Physics > Condensed Matter > Mesoscale Physics)",
        "Tasks require accessing paper metadata: abstracts, authors, submission dates, and versions (e.g., 'v3 submitted')",
        "Users must utilize arXiv identifiers or specific paper titles for precise searches",
        "Advanced search features are needed for date ranges, multi-field queries, or abstract/keyword filters",
        "Tasks involve differentiating between categories with similar names (e.g., Statistics vs. Statistics Theory)",
        "Navigation includes accessing HTML-formatted paper content alongside PDF versions",
        "Users must parse category-specific URLs (e.g., /list/astro-ph.GA/recent) for recent submissions",
        "Tasks require understanding arXiv's subject taxonomy (e.g., cs.CL for Computation & Language, q-fin for Quantitative Finance)"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Tasks require navigation through hierarchical academic subject categories and subcategories",
        "Search functionality includes field-specific filters (title, author, abstract) and date ranges",
        "Users frequently access recent submissions through 'new' and 'recent' category links",
        "Paper retrieval requires handling arXiv identifiers and specific document formats (PDF/HTML)",
        "Tasks involve abstract reading and section-specific content extraction from papers",
        "Advanced search capabilities are needed for complex queries combining multiple parameters",
        "Category browsing patterns follow standardized arXiv subject taxonomy structure",
        "Tasks require cross-referencing between paper metadata and author information",
        "Navigation includes accessing help resources and submission guidelines",
        "Users frequently interact with specialized subject codes (e.g., astro-ph.GA, cs.CL) for precise searches"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Both datasets require navigation through hierarchical academic subject categories and subcategories",
        "Tasks involve searching for papers using specific arXiv identifiers (e.g. astro-ph.GA, cs.CL)",
        "Users need to locate and interpret date-filtered results (recent/new submissions within time windows)",
        "Both require interaction with paper metadata including version history and submission dates",
        "Tasks involve finding and extracting specific content from paper sections (abstracts, introductions, results)",
        "Navigation requires understanding of arXiv's category-specific \"recent\" and \"search\" link patterns",
        "Both datasets include operations with arXiv's advanced search parameters and filters",
        "Tasks involve cross-referencing between category listings and individual paper records",
        "Users must distinguish between different document formats (PDF, HTML) for content extraction",
        "Both require handling of complex academic queries combining multiple search criteria (author + topic + date ranges)"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Both datasets require searching for academic papers using specific subject categories (e.g., 'astro-ph.GA', 'cs.CL', 'stat.ML')",
        "Tasks involve filtering results by recency (e.g., 'last day', 'last week', 'recent')",
        "Navigation includes accessing abstracts and specific sections (e.g., Introduction, Methodology) of papers",
        "Users must locate and interact with search filters (e.g., date ranges, author names, field-specific queries)",
        "Tasks require distinguishing between document formats (e.g., HTML, PDF) for paper retrieval",
        "Both involve hierarchical navigation through subject archives (e.g., Physics \u2192 Condensed Matter \u2192 Quantum Gases)",
        "Queries demand cross-category searches (e.g., machine learning in Statistics section, astrophysics with computational methods)",
        "Tasks require handling technical metadata (e.g., arXiv identifiers, version history, submission dates)",
        "Both datasets include citation/reference tracking within papers (e.g., '10th reference', journal references)",
        "Navigation involves supplemental content like privacy policies, help pages, and institutional acknowledgments"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Both datasets require navigation through hierarchical subject categories (e.g., Physics > Astrophysics of Galaxies)",
        "Tasks involve retrieving recent papers filtered by timeframes (e.g., last day/week)",
        "Users must interact with search bars supporting field-specific queries (e.g., title, author, abstract)",
        "Navigation includes accessing subcategory-specific lists (e.g., astro-ph.GA for Astrophysics of Galaxies)",
        "Tasks require parsing paper metadata (e.g., submission dates, author lists, version history)",
        "Abstract extraction is a common objective across tasks",
        "Both involve filtering by arXiv identifier categories (e.g., cs.CL, quant-ph)",
        "Users must locate specialized search interfaces (e.g., advanced search, form interface)",
        "Tasks require understanding arXiv's versioning system (e.g., v3 submission dates)",
        "Both datasets involve cross-referencing between primary categories and subfields (e.g., Computer Science > Machine Learning)"
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Both datasets require navigation through categorized content sections (e.g., News, Sport, Business, Culture).",
        "Tasks involve locating articles with timestamps indicating recency (e.g., '3 hrs ago', '1 day ago').",
        "Regional sections (e.g., Middle East, Asia, Europe) are critical for task completion in both datasets.",
        "Multimedia content (e.g., videos, images) is referenced in tasks requiring visual or interactive elements.",
        "Hierarchical navigation menus (e.g., headers, submenus) structure access to subsections like 'War in Ukraine' or 'Innovation'.",
        "Tasks demand parsing article summaries or headlines to extract key details (e.g., event locations, author names).",
        "Both datasets include 'Most Read' or 'Most Watched' lists for prioritizing high-traffic content.",
        "Search functionality is implied for tasks requiring keyword-based retrieval (e.g., 'climate change', 'Trump tariffs').",
        "Dynamic content updates (e.g., live coverage, breaking news) are relevant for time-sensitive tasks.",
        "Topic-specific tags (e.g., 'Business', 'World') are used to filter articles by subject across both datasets."
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Tasks require navigating through categorized sections (e.g., Sport, Business, Culture) to locate specific content types like articles or videos.",
        "Users must parse time-sensitive metadata (e.g., '3 hrs ago', '2 days ago') to identify recent updates or events.",
        "Tasks involve extracting key details from headlines, summaries, or captions within structured article previews.",
        "Geographic tags (e.g., Asia, Europe) are critical for filtering region-specific news across both datasets.",
        "Multimedia content discovery (e.g., videos, image galleries) is integral to tasks in both datasets.",
        "Hierarchical navigation from broad sections (e.g., World News) to subsections (e.g., Middle East) is consistently required.",
        "Tasks demand identifying event outcomes (e.g., sports scores, conflict updates) from time-stamped content.",
        "Search for niche topics (e.g., climate impacts, celebrity profiles) relies on menu traversal or keyword usage.",
        "Structured data extraction (e.g., locations, dates, involved entities) is necessary across all tasks.",
        "Differentiation between live updates, analysis pieces, and reports is required to fulfill task objectives."
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Both datasets require navigation through categorized sections (e.g., News, Sport, Business) to locate task-relevant articles.",
        "Tasks in both datasets involve identifying timestamped content (e.g., 'hrs ago') to verify recency of information.",
        "Geographical categorization (e.g., Middle East, Europe, Asia) is used in both datasets for region-specific queries.",
        "Both include live updates (marked as 'LIVE') for real-time event tracking in tasks.",
        "Articles in both datasets feature descriptive headlines and summaries to support content comprehension for summarization tasks.",
        "Multimedia content (images/videos) is embedded in articles across both datasets for visual context.",
        "Hierarchical navigation structures (e.g., sub-sections under 'World News') are present in both for topic refinement.",
        "Search functionality (via 'Search BBC' button) is implied in tasks requiring keyword-based queries in both datasets.",
        "Both datasets use standardized metadata tags (e.g., 'US & Canada', 'Science & Environment') for topic filtering.",
        "Footer navigation mirrors primary categories in both datasets, enabling consistent section access across pages."
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Tasks require navigating through categorized sections (e.g., News, Sport, Business) to locate content",
        "Users must identify time-sensitive information (e.g., 'latest', 'recent', 'within the last two days')",
        "Summarization of key insights from articles/reports is a common objective",
        "Tasks involve locating region-specific content (e.g., Middle East, Asia, Europe)",
        "Multimedia elements like videos are included in information retrieval objectives",
        "Global current affairs focus (e.g., wars, natural disasters, political developments)",
        "Economic/business impact analysis is required in multiple tasks",
        "Navigation includes hierarchical structures (e.g., subsections like 'BBC InDepth', 'Most Watched')",
        "Sports-related tasks demand checking schedules, results, or league standings",
        "Cultural content exploration (e.g., book reviews, film/art highlights) is present in both datasets"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Tasks require navigating through categorized sections (e.g., Sports, Business, Culture) to locate information.",
        "Users must identify time-sensitive content (e.g., 'latest,' 'recent,' 'within the last two days').",
        "Tasks involve parsing article metadata such as timestamps, geographical tags, and category labels.",
        "Navigation includes accessing region-specific content (e.g., Middle East, Europe, Asia).",
        "Users interact with hierarchical structures (main sections \u2192 subsections \u2192 articles).",
        "Tasks require distinguishing between news formats (e.g., live updates, analyses, videos).",
        "Queries demand summarization of key insights from articles or reports.",
        "Users must identify multimedia elements (e.g., videos, images) embedded in articles.",
        "Tasks involve cross-referencing content across thematic categories (e.g., war, climate, politics).",
        "Navigation includes filtering content by tags like 'BBC InDepth' or 'Most Read.'"
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Tasks involve searching for specific products using detailed filters (e.g., price range, material, size, ratings).",
        "Users frequently compare prices across multiple search results.",
        "Navigation tasks require filtering by product attributes (e.g., condition, color, brand).",
        "Tasks emphasize verifying customer reviews and star ratings (e.g., 4+ stars).",
        "Users check availability of free shipping or return policies.",
        "Product condition filters (e.g., 'Used - Good', 'pre-owned') are commonly used.",
        "Tasks involve navigating hierarchical categories (e.g., Home & Kitchen, Electronics).",
        "Users seek discounts, promotions, or items under specific price thresholds.",
        "Tasks include purchasing intent (e.g., adding to cart, checkout steps).",
        "Users prioritize item availability in specific configurations (e.g., size, color, capacity)."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Tasks involve searching for products with specific attributes (e.g., size, price range, material).",
        "Users are required to apply filters (e.g., condition, rating, price) to narrow search results.",
        "Tasks include comparing prices or features across multiple products.",
        "Navigation requires interacting with product categories (e.g., Electronics, Home & Kitchen).",
        "Users must locate and validate return/delivery policies for specific items.",
        "Tasks often involve adding items to cart or managing cart contents.",
        "Actions depend on parsing customer reviews or ratings (e.g., '4+ stars').",
        "Users need to identify product availability (e.g., free shipping, in-stock status).",
        "Tasks require navigating hierarchical menus (e.g., category \u2192 subcategory \u2192 product).",
        "Queries demand handling dynamic content like sale items or time-sensitive deals."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Tasks require filtering products by specific attributes (e.g., price range, size, material, condition).",
        "Navigation involves searching for items within defined categories (e.g., electronics, home goods, fashion).",
        "Users must validate product availability (e.g., stock status, shipping options, color/size variants).",
        "Tasks frequently include price comparison across multiple results or sellers.",
        "Actions involve checking customer review thresholds (e.g., 4+ stars, 500+ reviews).",
        "Multi-step workflows are common (e.g., search \u2192 filter \u2192 compare \u2192 add to cart).",
        "Users are required to interpret product specifications (e.g., dimensions, technical features, compatibility).",
        "Tasks emphasize time-sensitive criteria (e.g., new arrivals, seasonal items, publication year).",
        "Both datasets include interactions with policy details (return policies, warranty terms, Prime benefits).",
        "Users must navigate hierarchical menus (e.g., department \u2192 subcategory \u2192 product attributes)."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Tasks involve searching for specific products with detailed filters (price, size, material, rating).",
        "Tasks require checking product availability (color, shipping options, stock status).",
        "Users need to compare prices across multiple search results.",
        "Tasks involve adding items to cart or saving preferred products.",
        "Tasks require navigating through category-specific sections (e.g., electronics, home essentials).",
        "Users must verify customer review thresholds (e.g., 4+ stars).",
        "Tasks include filtering by deal types (e.g., 'Used - Good', 'on sale', 'discounted').",
        "Users are instructed to identify product specifications (e.g., capacity, dimensions, battery life).",
        "Tasks involve checking return/delivery policies or warranty information.",
        "Tasks require locating items under budget constraints with explicit price ranges."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Both datasets require users to perform product searches using specific attributes (e.g., price range, material, size, or customer ratings).",
        "Tasks involve filtering results based on price constraints (e.g., under $50, $50\u2013$100).",
        "Users must verify product availability details like color, size, or shipping options (e.g., free shipping).",
        "Both include actions to compare or evaluate multiple products (e.g., price comparisons, review analysis).",
        "Tasks require navigating category-specific sections (e.g., electronics, home goods, fashion).",
        "Users interact with promotional or sale-based filters (e.g., Winter Sale, discounted items).",
        "Product reviews and star ratings are critical for decision-making in both datasets.",
        "Tasks involve checking return policies or warranty information (e.g., free returns, protection plans).",
        "Both datasets emphasize locating products with niche features (e.g., hypoallergenic, water-resistant).",
        "Users execute post-search actions (e.g., adding to cart, saving results, purchasing) after criteria are met."
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Tasks require mathematical computation and problem-solving",
        "Tasks involve scientific data queries (e.g., physics, chemistry, engineering)",
        "Tasks request step-by-step solutions or procedural explanations",
        "Tasks include unit conversions and measurements",
        "Tasks compare multiple entities, methods, or datasets",
        "Tasks utilize Wolfram Alpha for domain-specific computations (e.g., integrals, derivatives, chemical equations)",
        "Tasks focus on real-world applications (e.g., health, finance, energy)",
        "Tasks require statistical analysis or probability calculations",
        "Tasks involve algebraic, calculus, or differential equations",
        "Tasks seek data retrieval from academic or technical domains (e.g., economics, astronomy, materials science)"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Tasks require computational problem-solving and mathematical calculations",
        "Queries involve unit conversions and dimensional analysis",
        "Use of Wolfram Alpha for scientific data retrieval (e.g., element properties, compound details)",
        "Requests for step-by-step solutions to equations or integrals",
        "Focus on physics, chemistry, and engineering concepts (e.g., projectile motion, stoichiometry)",
        "Tasks demand statistical analysis (e.g., standard deviation, variance)",
        "Queries about real-world applications (e.g., mortgage calculations, calorie estimation)",
        "Need for domain-specific knowledge in mathematics (e.g., Riemann Hypothesis, beta distribution)",
        "Exploration of Wolfram Alpha's curated datasets (e.g., GDP, historical trends)",
        "Use of natural language input to trigger computational workflows (e.g., \"solve 4x + 3 = 19\")"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Tasks require computational problem-solving involving mathematical equations and scientific formulas",
        "Queries involve unit conversions and measurement calculations across disciplines like physics, chemistry, and nutrition",
        "Tasks demand data retrieval from specialized domains (e.g., astronomy, thermodynamics, finance)",
        "Requests require comparisons between multiple entities, methods, or datasets (e.g., packing densities, food calories)",
        "Tasks utilize step-by-step solutions for mathematical operations (integration, derivatives, equation solving)",
        "Queries involve statistical analysis (distributions, standard deviation, probability calculations)",
        "Tasks focus on temporal calculations (date intervals, event timing, decay/half-life)",
        "Requests require interpretation of natural language inputs into structured computational queries",
        "Tasks involve academic/research-oriented topics (mathematical paradoxes, Fibonacci sequence applications)",
        "Queries leverage domain-specific curated data (chemical compound properties, celestial event data, economic metrics)"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Tasks require computational or mathematical problem-solving using Wolfram Alpha's engine.",
        "Queries involve scientific domains such as physics, chemistry, and engineering.",
        "Natural language input is utilized for interpreting complex technical questions.",
        "Tasks demand structured data retrieval (e.g., chemical properties, stock prices, historical metrics).",
        "Unit conversions and dimensional analysis are common requirements.",
        "Step-by-step solutions or derivations are explicitly requested (e.g., integrals, equation solving).",
        "Comparison of methodologies or entities (e.g., packing densities, economic factors) is required.",
        "Real-world applications in finance, health, or engineering are central to tasks.",
        "Queries span interdisciplinary topics (e.g., climate models, linguistic etymology, energy production).",
        "Tasks leverage Wolfram-specific functions or datasets (e.g., ProductLog, Fermi-Dirac distribution)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Tasks require mathematical computations such as solving equations, derivatives, integrals, and polynomial operations.",
        "Tasks involve scientific data retrieval including element properties, chemical compounds, and physics-related calculations.",
        "Tasks utilize Wolfram Alpha's specialized knowledge base for step-by-step solutions and expert-level answers.",
        "Tasks include unit conversions and measurements such as time intervals, energy outputs, and nutritional values.",
        "Tasks demand numerical precision with requirements for significant figures, scientific notation, and decimal places.",
        "Tasks involve real-world applications like financial calculations (mortgage, GDP) and health metrics (BMI, metabolic rates).",
        "Tasks require comparative analysis between different entities, models, or methods (e.g., packing densities, climate studies).",
        "Tasks focus on natural language input parsing for computational queries (e.g., 'Calculate the mass of Jupiter compared to Earth').",
        "Tasks include statistical computations such as variance, mean, and distribution analysis.",
        "Tasks involve structured data extraction from Wolfram Alpha's curated datasets (e.g., astronomical events, historical data, material properties)."
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Tasks require filtering recipes by user ratings (e.g., 4 stars or higher).",
        "Users need to search for recipes using ingredient-specific criteria (e.g., zucchini, chicken breast).",
        "Navigation involves filtering by dietary preferences (e.g., vegetarian, keto, low-carb).",
        "Tasks demand checking preparation/cooking time constraints (e.g., under 30 minutes).",
        "Users interact with recipe metadata (e.g., calories per serving, nutrition facts).",
        "Navigation includes accessing user-generated reviews and ratings for validation.",
        "Tasks involve browsing recipe categories (e.g., dinners, desserts, holidays).",
        "Users need to locate recipes with specific meal types (e.g., breakfast, brunch).",
        "Tasks require parsing recipe titles/descriptions for ingredient inclusion (e.g., quinoa, cranberries).",
        "Navigation includes saving/printing recipes or generating ingredient shopping lists."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Tasks require filtering recipes by user ratings (e.g., minimum 4 stars).",
        "Queries specify preparation/cooking time constraints (e.g., <30 minutes).",
        "Dietary preferences are explicitly mentioned (e.g., vegetarian, keto, low-carb).",
        "Nutritional requirements are prioritized (e.g., calorie count, carb content).",
        "Users seek recipe saving/bookmarking functionality for later access.",
        "Occasion-specific recipe searches are common (e.g., holidays, seasonal events).",
        "Comparative analysis of recipes is required (e.g., 'best version' queries).",
        "Ingredient-driven searches are frequent (e.g., chicken breast, quinoa).",
        "User reviews and ratings influence recipe selection decisions.",
        "Navigation involves structured categories (e.g., cuisines, meal types, ingredients)."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Tasks require filtering recipes by user ratings (e.g., 4+ stars) in both datasets.",
        "Users must locate recipes with specific dietary constraints (e.g., vegetarian, keto, gluten-free) in both datasets.",
        "Navigation involves searching for recipes by ingredient (e.g., chicken breast, quinoa) in both datasets.",
        "Tasks require extracting preparation/cooking time details from recipe pages in both datasets.",
        "Users must parse and compare user reviews/ratings (e.g., 50+ reviews) to evaluate recipe quality in both datasets.",
        "Navigation includes accessing nutritional information (e.g., calories, carbs) from recipe pages in both datasets.",
        "Tasks involve saving/bookmarking recipes (e.g., 'Save Recipe' button interactions) in both datasets.",
        "Users must navigate category hierarchies (e.g., Dinners > Chicken > Quick Meals) to find recipes in both datasets.",
        "Tasks require identifying recipe variations (e.g., zucchini lasagna vs. traditional lasagna) in both datasets.",
        "Navigation includes seasonal/holiday-specific recipe sections (e.g., Christmas desserts) in both datasets."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Both datasets involve user tasks focused on recipe discovery with specific constraints (ratings, prep time, ingredients).",
        "Search functionality with filters for recipe attributes (cook time, calories, dietary needs) is central to navigation in both.",
        "User-generated content elements (ratings, reviews) are critical for decision-making across tasks in both datasets.",
        "Recipe categorization by meal type (dinner, breakfast) and cuisine (Italian, Greek) is consistently utilized.",
        "Navigation paths include browsing through curated lists (Popular Recipes, Trending Now) in both datasets.",
        "Tasks frequently require accessing detailed recipe pages with nutritional information and serving sizes in both.",
        "Both datasets emphasize seasonal/holiday-specific content (Easter, Christmas) in user tasks.",
        "User interaction patterns include saving/favoriting recipes and creating shopping lists in both datasets.",
        "Multi-step navigation (search \u2192 filter \u2192 compare reviews) is fundamental to task completion in both.",
        "Content organization around community-driven features (Home Cook testimonials, Allstars) supports task contexts in both."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Tasks involve searching for recipes by specific ingredients (e.g., chicken breast, quinoa, zucchini).",
        "Users filter recipes by preparation time constraints (e.g., under 30 minutes, less than 1 hour).",
        "Recipes are evaluated based on user ratings (e.g., 4+ stars, 50+ reviews).",
        "Navigation includes dietary preferences (e.g., vegetarian, low-carb, vegan).",
        "Tasks require identifying recipes for specific occasions (e.g., holidays, parties, seasonal events).",
        "Users seek nutritional information (e.g., calories per serving, carbohydrate content).",
        "Navigation includes meal-type categorization (e.g., dinners, appetizers, desserts).",
        "Tasks involve accessing community-generated content (e.g., reviews, ratings, home cook testimonials).",
        "Users explore trending/popular recipe sections (e.g., 'Fresh Picks', 'The Latest').",
        "Navigation includes saving/bookmarking recipes for later use."
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Both datasets require users to look up word definitions with UK and US pronunciations provided.",
        "Tasks involve navigating to specific sections (e.g., Grammar, Thesaurus, Translate) via structured menus.",
        "Users must locate example sentences for queried words within dictionary entries.",
        "Tasks include comparing regional pronunciation differences (e.g., UK vs. US English).",
        "Navigation requires interacting with search bars and dropdown language/dictionary selectors.",
        "Users explore grammar rules (e.g., passive voice, articles, comparative adjectives) through dedicated sections.",
        "Tasks involve accessing translation features (e.g., English\u2013French, English\u2013Chinese) via directional toggles.",
        "Both datasets include tasks related to word games (e.g., Word Scramble) under the '+Plus' section.",
        "Users must browse alphabetical indexes (A-Z) to navigate dictionary entries.",
        "Tasks require identifying synonyms/antonyms or multiple meanings of words through cross-referenced content."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Both datasets require navigation through dictionary, grammar, and thesaurus sections for task completion.",
        "Tasks in both datasets involve retrieving word definitions, pronunciations (UK/US), and example sentences.",
        "Both include translation functionalities between English and multiple languages (e.g., French, Spanish, Chinese).",
        "Pronunciation guides with International Phonetic Alphabet (IPA) notation are available for words in both datasets.",
        "Grammar-related tasks in both datasets focus on rules (e.g., articles, comparative adjectives, modal verbs) with usage examples.",
        "Both feature interactive elements like the Word Scramble game in the '+Plus' section for language practice.",
        "Tasks require users to compare linguistic variations (e.g., British vs. American English pronunciation or grammar).",
        "Both include blog content and 'Word of the Day' sections for extended vocabulary exploration.",
        "Navigation to language-specific dictionaries (e.g., Learner\u2019s Dictionary, Essential British/American English) is common to both.",
        "Both datasets involve cookie/privacy banners and social media sharing buttons as part of the interface."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Tasks require searching for word definitions, including multiple meanings where applicable.",
        "Pronunciation retrieval in both UK and US English, often with IPA notation.",
        "Utilization of example sentences to understand word usage.",
        "Navigation to grammar sections for rules and usage examples.",
        "Use of translation features for converting words to/from other languages.",
        "Accessing the Plus section for additional resources like games (e.g., Word Scramble).",
        "Interaction with blog content for explanations of phrases and new words.",
        "Browsing dictionary sections (e.g., A-Z index) to locate words.",
        "Comparing different language directions (e.g., English-French) in translations.",
        "Dynamic content interaction such as word games or new word entries."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Tasks require searching for word definitions with multiple meanings/senses",
        "Navigation involves accessing UK/US pronunciation guides with IPA notations",
        "Example sentence lookup is a core component of word entries",
        "Grammar section usage for understanding language rules (articles, comparatives, passive voice)",
        "Translation functionality between multiple languages (e.g. English-Chinese, English-Spanish)",
        "Word game interaction present in Plus section (e.g. Word Scramble)",
        "Structured browsing through alphabetical dictionary indexes (A-Z listings)",
        "Requires navigation between different dictionary types (Learner's, Essential, Thesaurus)",
        "Social media sharing capabilities for word entries and blog content",
        "Multi-format content presentation (definitions, phonetics, blog posts, word lists)"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Both datasets require users to perform dictionary word lookups with definitions and example sentences",
        "Tasks involve accessing pronunciation guides with IPA notation for UK/US English variants",
        "Navigation to grammar sections for explanations of linguistic rules (e.g., articles, adjectives) is required",
        "Both include translation functionality between multiple languages (e.g., English-Spanish, English-Chinese)",
        "Users must locate and interpret synonyms/antonyms through the Thesaurus feature",
        "Tasks require interaction with phonetic transcription elements and audio pronunciation buttons",
        "Both datasets involve exploring word usage differences (e.g., fewer vs less, UK vs US English)",
        "Navigation through hierarchical content structure (dictionary\u2192grammar\u2192translations) is essential",
        "Tasks utilize search functionality with dictionary selection filters (e.g., learner's dictionary)",
        "Both require understanding of cross-referenced content between dictionary entries and grammar guides"
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Tasks require navigating product specification pages (e.g., iPhone models, MacBook specs)",
        "Price comparison between device models/versions is a core task objective",
        "Requires interaction with product configuration tools (storage, connectivity options)",
        "Tasks involve locating technical details like chip types, camera specs, or display resolutions",
        "Requires navigation through hierarchical product categories (e.g., iPhone > Pro models)",
        "Tasks demand accessing both marketing content and technical support documentation",
        "Requires understanding of Apple's product taxonomy (Pro vs. Air vs. standard models)",
        "Tasks involve cross-referencing between product pages and accessory compatibility information",
        "Requires navigation through purchase pathways including customization and checkout options",
        "Tasks necessitate finding time-sensitive information (newest models, latest OS compatibility)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Tasks require navigating to product specification pages (e.g., storage capacity, processor details, camera features)",
        "Price comparison workflows for different product configurations/models are central to user intents",
        "Device comparison tasks between current and previous generations exist in both datasets",
        "Trade-in value lookup functionality is required for device upgrade assessments",
        "Support documentation navigation for account management (e.g., Apple ID recovery) appears in both",
        "Accessory compatibility checks (e.g., Smart Folio, Apple Pencil) are present in tasks",
        "In-store availability checks and pickup scheduling requirements exist in both datasets",
        "Educational/business purchase programs require specific navigation paths in tasks",
        "Feature compatibility verification across device/OS versions is needed (e.g., iOS updates)",
        "Environmental impact disclosures and corporate responsibility information are search targets"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Tasks require navigating to product-specific pages for detailed specifications (e.g., iPhone, MacBook, Apple Watch).",
        "Users must compare features, prices, or configurations across multiple Apple product models or generations.",
        "Tasks involve identifying technical details like chip types (e.g., M3, A18 Pro), storage capacities, or display specifications.",
        "Price calculation tasks require interacting with configuration tools for storage, connectivity, or accessory options.",
        "Navigation includes accessing support resources (e.g., Apple ID recovery, parental controls, repair pricing).",
        "Tasks require locating trade-in value estimators or understanding trade-in program conditions.",
        "Users must identify product availability details (e.g., in-store pickup options, regional restrictions).",
        "Tasks involve exploring educational discounts or business/enterprise purchasing programs.",
        "Navigation includes cross-referencing product environmental reports or sustainability commitments.",
        "Tasks require distinguishing between subscription services (e.g., Apple Fitness+, Apple TV+) and hardware purchases."
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Tasks require locating product specifications (e.g., storage capacity, display size, chip type).",
        "Navigation involves comparing multiple product models or variants (e.g., iPhone 16 Pro vs. Pro Max).",
        "Users frequently seek price calculations for customized configurations (e.g., storage upgrades, accessories).",
        "Tasks involve identifying purchasing workflows (e.g., checkout steps, trade-in integration).",
        "Enterprise/Business use-case navigation is present (e.g., bulk purchases, business plans).",
        "Support-related queries require accessing troubleshooting guides or repair costs (e.g., Apple ID recovery).",
        "Product feature verification is common (e.g., camera specs, health sensors, software compatibility).",
        "Tasks demand interaction with device customization tools (e.g., color selection, engraving).",
        "Users navigate through hierarchical product categories (e.g., Store > iPhone > Accessories).",
        "Footers/legal documentation retrieval is required for policies (e.g., environmental reports, trade-in terms)."
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Tasks require navigating product specification pages to find technical details like processor type, storage capacity, and device dimensions",
        "Users must interact with product comparison features to evaluate different models (e.g., iPhone generations, MacBook configurations)",
        "Price checking activities necessitate accessing customized configuration interfaces with storage/accessory options",
        "Support-related tasks require finding authentication recovery processes and warranty management systems",
        "Trade-in value lookup functionality is essential for calculating device upgrade costs",
        "Product availability checks require integration with location-based store inventory systems",
        "Accessory compatibility verification demands cross-referencing product-specific compatibility lists",
        "Camera feature analysis requires deep navigation through technical photography specifications",
        "Business/productivity solutions research involves accessing specialized enterprise-oriented content sections",
        "Battery optimization and device maintenance tasks require accessing technical support knowledge bases"
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Tasks require retrieving real-time or current information (e.g., news, stock prices, sports scores).",
        "Navigation involves structured search interfaces with filters or autocomplete features.",
        "Tasks demand comparative analysis (e.g., product prices, recipe variations, stock performance).",
        "Procedural task execution (e.g., scheduling meetings, following tutorials, configuring settings).",
        "Interaction with multimedia content (e.g., images, videos, tutorials) is implied for task completion.",
        "Tasks rely on extracting factual data (e.g., specifications, statistics, scientific measurements).",
        "User customization of search parameters (e.g., location-based results, language settings) is required.",
        "Navigation targets domain-specific content (e.g., tech, healthcare, sports, academia).",
        "Multi-step navigation (e.g., search \u2192 filter \u2192 verify \u2192 extract) is essential for task success.",
        "Tasks depend on external website integration (e.g., Allrecipes, IMDb, GitHub, job boards)."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Tasks require extracting specific numerical data or metrics (e.g., player counts, distances, rankings).",
        "Queries involve retrieving current or real-time information (e.g., latest news, stock prices, recent events).",
        "Tasks demand multi-step navigation (e.g., search, locate, verify, compare).",
        "Focus on structured data formats like rankings, lists, or charts (e.g., top destinations, weekly music charts).",
        "Need to identify and synthesize information from diverse content types (e.g., news articles, technical specifications, bios).",
        "Tasks often involve external websites or platforms (e.g., IMDb, GitHub, recipe blogs).",
        "Require parsing technical or domain-specific details (e.g., hardware requirements, scientific terms, financial metrics).",
        "Emphasis on comparative analysis (e.g., stock prices, SEO strategies, language differences).",
        "Tasks may include user interactions beyond passive retrieval (e.g., form submissions, data pasting, account actions).",
        "Queries target verifiable factual answers rather than subjective or opinion-based results."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Tasks require formulating search queries to retrieve specific information from search engines",
        "Tasks involve navigating through search results to locate external websites or platforms (e.g., Steam, IMDb, GitHub, YouTube)",
        "Tasks demand extracting precise numerical or factual data (e.g., dates, rankings, statistics, definitions)",
        "Tasks focus on current or time-sensitive information (e.g., latest news, recent research, real-time metrics)",
        "Tasks necessitate parsing structured content like lists, tables, or charts for answers",
        "Tasks include multi-step actions (e.g., search \u2192 copy \u2192 paste \u2192 verify)",
        "Tasks target domain-specific knowledge across diverse fields (e.g., sports, science, entertainment, technology)",
        "Tasks require comparative analysis (e.g., stock performance, differences between languages)",
        "Tasks involve transactional objectives (e.g., job applications, ticket bookings, newsletter sign-ups)",
        "Tasks leverage advanced search features (e.g., date filters, category-specific queries, autocomplete predictions)"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "Tasks require information retrieval via search engine queries (e.g., news, statistics, definitions).",
        "Queries often involve real-time or up-to-date data (e.g., latest news, current prices, live scores).",
        "Tasks demand navigating to specific subpages or sections (e.g., product pages, venue details, event listings).",
        "Multi-step actions are common (e.g., search, filter, compare, then extract data).",
        "Focus on fact-based answers (e.g., dates, numbers, names, rankings, specifications).",
        "Tasks require parsing structured content (e.g., tables, charts, search results, product descriptions).",
        "Use of comparative or evaluative criteria (e.g., ratings, price ranges, eligibility requirements).",
        "Location-specific parameters are frequent (e.g., \"near me,\" city-based searches, regional availability).",
        "Tasks involve technical or domain-specific terminology (e.g., SEO, stock prices, medical terms, hardware specs).",
        "Time-bound constraints appear (e.g., \"latest,\" \"released in 2024,\" \"posted in the last 3 days\")."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Tasks require locating specific information through search engines or website navigation.",
        "Queries often involve retrieving numerical data (e.g., rankings, counts, prices, statistics).",
        "Tasks necessitate navigating to dedicated website sections (e.g., news, product pages, event listings).",
        "User intents focus on real-time or up-to-date information (e.g., latest news, current metrics).",
        "Tasks demand comparative analysis (e.g., stock prices, language differences, product models).",
        "Queries target specific entities (e.g., individuals, teams, movies, locations, products).",
        "Instructions include multi-step actions (e.g., search, extract, validate, and report).",
        "Tasks involve technical or educational content (e.g., software setup, AI principles, academic programs).",
        "User goals require parsing structured data (e.g., charts, tables, lists, rankings).",
        "Tasks rely on external websites or platforms (e.g., news outlets, repositories, e-commerce sites)."
      ]
    }
  },
  "diffs_synth_from_real": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Dataset B tasks frequently require making reservations or bookings (e.g., hotel rooms, restaurant tables), while Dataset A focuses purely on search and navigation without booking actions.",
        "Dataset B tasks explicitly request pricing details for specific products/services (e.g., hotel rooms, meal plans), whereas Dataset A focuses on pricing tiers as filters rather than direct price extraction.",
        "Dataset B includes queries for specific menu items (e.g., 'Crispy Chicken Sandwich'), while Dataset A only requests general menu availability.",
        "Dataset B tasks combine temporal constraints with multi-criteria prioritization (e.g., 'open Sunday + Italian + moderate price + proximity'), while Dataset A uses temporal constraints as standalone filters.",
        "Dataset B requires parsing user reviews for specific attributes (e.g., 'reviews mentioning shakes'), whereas Dataset A only requires identifying review scores or reading general review content.",
        "Dataset B includes explicit guest count/group size parameters for accommodations (e.g., '2 guests'), which never appear in Dataset A tasks.",
        "Dataset B tasks specify transportation modes as requirements for route planning (e.g., 'utilizes train'), while Dataset A only requires generic route generation.",
        "Dataset B contains tasks requiring comparison of features across multiple POIs (e.g., 'compare playground slides'), unlike Dataset A's single-POI focus.",
        "Dataset B includes event-based temporal constraints (e.g., 'New Year's Day reservation'), while Dataset A uses only generic time/date filters.",
        "Dataset B tasks explicitly seek medical/healthcare service qualifications (e.g., 'hospitals with good maternal care'), a specificity absent in Dataset A."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Dataset B tasks frequently require booking or reservation actions (e.g., hotel stays, restaurant tables).",
        "Dataset B tasks explicitly specify budget constraints (e.g., \"$400 for 2 nights\").",
        "Dataset B tasks prioritize price comparisons or price extraction (e.g., hotel rates, product plans).",
        "Dataset B tasks involve viewing user-generated content (e.g., photos, reviews) as a primary objective.",
        "Dataset B tasks include time-specific parameters (e.g., start dates, multi-night stays, days of operation).",
        "Dataset B tasks emphasize accommodation policies (e.g., free cancellation, reservation modifications).",
        "Dataset B tasks target medical-specific accessibility needs (e.g., medical transportation, wheelchair product searches).",
        "Dataset B tasks require online ordering capabilities (e.g., restaurants with ordering options).",
        "Dataset B tasks request aggregated metrics (e.g., average ratings, deforestation rates) rather than granular details.",
        "Dataset B tasks focus on multi-step travel planning (e.g., combining hotel bookings with transportation schedules)."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Dataset B tasks frequently involve international or cross-city locations (e.g., Paris hotels, Tokyo restaurants) while A focuses on specific domestic locations",
        "Dataset B requires combination of service features with time/availability constraints (e.g., 'hotels with pool + specific date') where A combines physical attributes (e.g., 'EV charging + proximity')",
        "Dataset B contains explicit property transaction tasks (e.g., 'buy tickets', 'make reservations') absent in A's information-focused queries",
        "Dataset B includes multi-leg accessibility requirements (e.g., 'wheelchair accessible route with multiple stops') while A focuses on single destination accessibility",
        "Dataset B tasks emphasize comparative analysis between options (e.g., 'compare hotels') where A focuses on absolute attribute filtering",
        "Dataset B requires integration of tourism-specific metadata (e.g., attraction tickets, park trail details) unlike A's business/service focus",
        "Dataset B contains explicit trip planning duration parameters (e.g., '4-6 day stay') while A uses immediate time constraints",
        "Dataset B features relative location descriptors (e.g., 'near driving route', 'along the way') where A uses absolute positional references",
        "Dataset B includes reservation system interactions (e.g., 'make reservation') absent in A's information retrieval tasks",
        "Dataset B tasks require parsing layered recreational information (e.g., 'hike details + attractions + photos') where A focuses on discrete business attributes"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Dataset B tasks require booking/reservation systems with explicit date ranges and guest numbers, while Dataset A focuses on general availability checks without date-specific parameters.",
        "Dataset B includes tasks involving international locations (e.g., Paris, Mumbai, Buenos Aires), whereas Dataset A focuses primarily on domestic US locations.",
        "Dataset B tasks frequently involve price comparisons with specific monetary thresholds (e.g., $160/night), while Dataset A uses relative terms like 'price range' without exact figures.",
        "Dataset B requires interaction with multimedia content (e.g., 360\u00b0 views, photo galleries) as part of place exploration, unlike Dataset A which focuses on textual/practical information retrieval.",
        "Dataset B contains tasks requiring combined filters (e.g., 'open now + specific cuisine + price + rating'), while Dataset A typically uses single or dual filter criteria.",
        "Dataset B includes tasks requesting historical/geographical context (e.g., environmental threats to regions), while Dataset A focuses exclusively on real-time/navigation-related information.",
        "Dataset B tasks explicitly reference user-generated content creation (e.g., 'be prepared to write a review'), whereas Dataset A only requires consumption of existing reviews.",
        "Dataset B involves flight search functionality, which is absent in Dataset A tasks focused on ground transportation and local services.",
        "Dataset B requires analysis of transportation schedules/routes (e.g., public transit timing), while Dataset A focuses on static route directions between points.",
        "Dataset B tasks specify brand-name businesses (e.g., 'Pacific Catch', 'Name. Design Agency'), while Dataset A uses generic category-based searches (e.g., 'plumbers', 'beauty salons')."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Dataset B tasks more frequently involve reservations with specific dates/times (e.g., 'January 1, 2025 brunch')",
        "Dataset B requires price comparisons (e.g., 'best price', 'cheapest French brunch')",
        "Dataset A includes technical actions like map printing/sharing (e.g., 'print map as PDF', 'generated sharing link')",
        "Dataset B emphasizes itinerary planning (e.g., 'day trip itinerary', 'suggest bike-friendly stops')",
        "Dataset A tasks require parsing hierarchical location data (e.g., 'which level has least proportion in reviews')",
        "Dataset B explicitly requests user experience evaluations (e.g., 'check out their menu', 'read comfort food reviews')",
        "Dataset A contains more granular parking specifications (e.g., 'motorcycle parking', 'EV charging supported parking')",
        "Dataset B uses budget qualifiers like 'affordable' and 'moderately priced' as primary filters",
        "Dataset A tasks demand review analysis (e.g., 'check what a one-star review says')",
        "Dataset B includes temporal event-based searches (e.g., 'New Year's Eve hotel', 'specific date reservations')"
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks in dataset B focus on comparing multiple pricing plans (e.g., Copilot free vs. paid) while dataset A focuses on verifying individual product pricing",
        "Dataset B requires direct analysis of security compliance certifications (e.g., SOC 2) whereas dataset A focuses on general security feature navigation",
        "Dataset B includes tasks about educational resource enrollment (e.g., signing up for courses) while dataset A focuses on locating existing educational content",
        "Dataset B contains explicit comparisons of feature availability across plans (e.g., Copilot features in Team vs Enterprise) absent in dataset A",
        "Tasks in dataset B specifically target vulnerability severity filtering (e.g., high-severity GHSA advisories) while dataset A focuses on general vulnerability discovery",
        "Dataset B requires understanding of account creation policies (e.g., privacy policy during signup) whereas dataset A focuses on account validation post-creation",
        "Tasks in dataset B involve third-party service integration costs (e.g., SAP pricing) not present in dataset A",
        "Dataset B includes analysis of project management workflows (e.g., task tracking setup) while dataset A focuses on repository contribution patterns",
        "Dataset B tasks require direct engagement with legal/IP documentation (e.g., Copilot's IP implications) absent in dataset A",
        "Dataset B contains historical incident analysis (e.g., past service outages) while dataset A focuses on current status checks"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Tasks in B require locating specific security advisory severity levels (e.g. high-severity)",
        "Tasks in B involve retrieving compliance certifications (e.g. CSA STAR Certificate)",
        "Tasks in B require finding implementation-specific technical details (e.g. YAML code examples)",
        "Tasks in B focus on data usage specifics for AI features (Copilot's personal data processing)",
        "Tasks in B require accessing enterprise sales processes (quote requests for Advanced Security)",
        "Tasks in B involve finding installation instructions for specific packages/components",
        "Tasks in B require identifying job/career information (open positions at GitHub)",
        "Tasks in B focus on translation/localization of content (e.g. translating pages to Spanish)",
        "Tasks in B require direct interaction with UI animations/demonstrations (pausing demo videos)",
        "Tasks in B emphasize API technical comparisons (REST vs GraphQL implementation differences)"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Tasks in B require explicit analysis of security compliance standards (e.g., GDPR, data protection)",
        "B includes tasks focused on account lifecycle management (creation, trials, cancellation)",
        "B tasks involve direct comparison of paid tiers' enterprise-level features",
        "B requires interpreting technical implementation details (CLI setup, action configurations)",
        "Tasks in B specifically target understanding data retention policies",
        "B includes troubleshooting scenarios requiring error message interpretation in CI/CD workflows",
        "Tasks in B demand analysis of business/enterprise-specific Copilot usage terms",
        "B requires navigation through legal/terms documentation for compliance verification",
        "Tasks in B involve configuration of educational/classroom-specific settings",
        "B includes exploration of API usage metrics and developer activity analytics"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Tasks in B specifically inquire about enterprise solution pricing for specialized projects (e.g., NLP)",
        "B requires identifying release dates for product features (e.g., GitHub Copilot)",
        "B includes requests for educational/enterprise trial eligibility and sign-up processes",
        "B tasks involve retrieving specific vulnerability identifiers (e.g., CVE-2024-51988)",
        "B requires understanding custom API/GraphQL integrations beyond general documentation",
        "B tasks focus on project management best practices using GitHub Issues",
        "B includes requests about autograding implementations in GitHub Classroom",
        "B requires locating compliance certifications (e.g., CSA STAR Certificate)",
        "B tasks involve third-party extension pricing (e.g., Hyperlint AI for Copilot)",
        "B includes app store rating checks and specific repository contribution processes"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Dataset B tasks focus on GitHub product setup/configuration (Copilot enrollment, project creation) while A focuses on repository discovery/analysis",
        "B requires navigation through GitHub's marketing/pricing pages for product comparisons (Copilot plans) whereas A focuses on repository search/filtering",
        "Tasks in B involve detailed exploration of GitHub's security advisories (CVE lookup) while A focuses on general security feature usage",
        "B contains explicit tasks for comparing technical API implementations (REST vs GraphQL) while A focuses on API documentation consumption",
        "Dataset B includes organizational account management tasks (team plan upgrades) not present in A's individual account focus",
        "B requires interaction with GitHub's project management features (task lists, board layouts) absent from A's repository-centric tasks",
        "Tasks in B demand understanding of GitHub's data policies (Copilot training data) while A focuses on code/content extraction",
        "B contains troubleshooting scenarios (project creation errors) not present in A's straightforward information retrieval tasks",
        "Dataset B includes license-specific repository filtering (GNU licenses) while A uses general language/star-based filtering",
        "B tasks require navigation through GitHub's educational resources (Skills courses) for feature implementation versus A's course discovery focus"
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Dataset A tasks focus on retrieving historical game results and player statistics from past events (e.g., 'yesterday's matchups'), while Dataset B emphasizes real-time/live game tracking and upcoming schedules.",
        "Dataset B tasks require interaction with betting odds and fantasy sports predictions (e.g., 'NFL Week 17 betting odds'), which are absent in Dataset A's requirements.",
        "Dataset A includes granular player physical attributes (e.g., 'heaviest weight among infielders'), whereas Dataset B focuses on team/player performance metrics in active competitions.",
        "Dataset B tasks involve navigation through conference-specific college sports rankings (e.g., NCAA bowl season) not prominently featured in Dataset A.",
        "Dataset A requires comparison of historical game statistics (e.g., 'loser high vs. winner high'), while Dataset B prioritizes current standings/playoff brackets.",
        "Dataset B tasks include international soccer league navigation (e.g., UEFA Conference League, Serie A) beyond Dataset A's primary North American sports focus.",
        "Dataset A emphasizes career-spanning player statistics retrieval, while Dataset B focuses on single-season/event performance analysis (e.g., '2024 NFL season stats').",
        "Dataset B requires identification of broadcast/streaming platforms for live events (e.g., 'where to watch Premier League'), absent in Dataset A's tasks.",
        "Dataset A tasks involve complex filtering of schedules by specific statistical outcomes, while Dataset B uses temporal filters for upcoming/pending games.",
        "Dataset B includes roster impact analysis from injuries (e.g., 'lineup changes post-injury'), whereas Dataset A focuses on basic injury report retrieval."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Dataset B tasks involve NCAA Football (NCAAF) game results and bowl game navigation, absent in Dataset A",
        "Dataset B includes cross-sport comparison tasks (e.g., simulating NBA trades with soccer players), while A focuses on single-sport analysis",
        "Dataset B requires navigation of NFL playoff scenarios/playoff machines, whereas A focuses on regular season standings",
        "Dataset B tasks reference international soccer transfers/free agents, while A remains focused on North American leagues",
        "Dataset B contains explicit requests for fantasy baseball/hockey content, while A emphasizes basketball-focused fantasy tools",
        "Dataset B tasks require comparing odds across sports (NCAAF vs NFL), while A focuses on single-sport betting details",
        "Dataset B includes requests for season-long player performance tracking (2024 NFL season), while A focuses on career/game-specific stats",
        "Dataset B tasks require navigation between college and professional sports sections, while A maintains league-specific boundaries",
        "Dataset B contains soccer-specific broadcast information requests (Premier League matches), absent in Dataset A",
        "Dataset B includes podcast/audio content navigation (ESPN Radio NBA podcasts), not present in Dataset A tasks"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Dataset B tasks focus more on college football (NCAAF) and soccer (EPL), while Dataset A covers a broader range of sports including NBA, NHL, and MLB.",
        "Dataset B includes more requests for postseason-specific data (e.g., College Football Playoff brackets, NFL Playoff Machine) compared to Dataset A's general playoff/bracket navigation.",
        "Tasks in Dataset B frequently involve future-oriented information (e.g., 2024 NFL schedules, bowl game predictions), whereas Dataset A emphasizes recent/past game data retrieval.",
        "Dataset B tasks require accessing betting odds and fantasy sports rankings, which are not prominently featured in Dataset A's samples.",
        "Dataset B navigation often includes logistical details (e.g., TV channels for matches), while Dataset A focuses on ESPN+ streaming schedules for live content.",
        "Dataset B tasks involve retrieving historical player performance metrics (e.g., past season stats), whereas Dataset A emphasizes real-time career stats and cross-season comparisons.",
        "Dataset B contains more granular NFL season navigation (e.g., Week 18 scores, team-specific schedules) compared to Dataset A's general NFL scoreboard tasks.",
        "Dataset B prioritizes college football bowl games and CFP content, while Dataset A includes NCAA Basketball Tournament Challenges (March Madness).",
        "Dataset B tasks require direct team-vs-team comparisons (e.g., Cavaliers vs. Thunder stats), whereas Dataset A involves broader standings comparisons across conferences.",
        "Dataset B includes explicit requests for event-specific schedules (e.g., 2024 Christmas Day games), while Dataset A filters by date ranges without fixed holiday events."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Tasks in B focus more on college football (NCAAF) bowl games, CFP brackets, and specific team schedules, while A emphasizes NBA, NHL, and international soccer events like Copa del Rey.",
        "Dataset B includes tasks requiring retrieval of historical player career stats (e.g., Michael Jordan's Wizards tenure), whereas A focuses on real-time or recent player/team metrics.",
        "B involves accessing ESPN+ for live streams of specific games (e.g., Buffalo Sabres vs. Blues), while A\u2019s ESPN+ tasks relate to general content like shows or features.",
        "B requires navigating to NCAA basketball (NCAAM/NCAAW) game results and team performance, while A includes NCAA softball/baseball.",
        "Tasks in B demand comparisons of NFL team rosters (e.g., Chiefs vs. Cardinals), whereas A involves filtering/sorting player/team stats (e.g., rebounds leaders).",
        "B includes requests for NFL playoff tiebreaker scenarios (e.g., AFC/NFC seeding), while A covers conference standings without explicit tiebreaker navigation.",
        "Dataset B tasks involve retrieving box scores and game-specific statistics (e.g., Spurs vs. Nets), whereas A focuses on seasonal standings or aggregated stats.",
        "B requires locating college football transfer news (e.g., Real Madrid to Arsenal), while A\u2019s news tasks center on NBA trades or NFL free agency.",
        "B tasks include accessing ESPN Radio for sports podcasts, while A\u2019s multimedia tasks target highlights or broadcasts without podcast emphasis.",
        "Dataset B emphasizes team schedule navigation (e.g., Brentford\u2019s upcoming matches), while A prioritizes event categorization (e.g., Final Four schedules)."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Tasks in dataset B focus more on NFL-specific queries including team standings, player performance, and playoff scenarios, while dataset A covers a broader range of sports leagues (e.g., NBA, NHL, NCAA, MLB).",
        "Dataset B includes tasks related to fantasy sports insights (e.g., 'fantasy baseball-related information'), whereas dataset A does not explicitly mention fantasy sports interactions.",
        "Dataset B contains queries about specific future or historical dates (e.g., 'January 4, 2025', 'Week 16'), while dataset A emphasizes real-time or recent game data without date-specific constraints.",
        "Tasks in dataset B involve strategic scenarios like creating hypothetical trades (e.g., 'Create an NBA trade between teams') or calculating playoff chances, which are absent in dataset A.",
        "Dataset B includes requests for draft-related information (e.g., '2024 NBA draft results'), whereas dataset A focuses on in-season statistics and standings.",
        "Dataset B requires navigation to league-specific tools (e.g., 'NFL Playoff Machine'), while dataset A emphasizes ESPN+ content summaries and subscription features.",
        "Dataset B includes esports-related queries, whereas dataset A tasks are strictly tied to traditional sports leagues.",
        "Dataset B tasks frequently reference college football bowl games and postseason scenarios (e.g., 'CFB Bowl Games'), while dataset A focuses on regular-season tournaments like NCAA Final Four.",
        "Dataset B requires retrieving player-specific performance metrics for niche roles (e.g., 'Deuce Vaughn\u2019s NFL stats'), whereas dataset A emphasizes broader player statistics (e.g., 'assists, points').",
        "Dataset B tasks involve cross-sport navigation (e.g., switching from college football to NFL scores), while dataset A tasks are confined to single-sport sections per query."
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Dataset A tasks frequently require extracting precise numerical metadata (e.g., download counts, GitHub stars) while Dataset B emphasizes broader resource discovery (e.g., tutorials, community posts)",
        "Dataset B tasks more commonly involve finding/accessing educational materials (courses, tutorials) compared to Dataset A",
        "Dataset A tasks focus more on temporal recency constraints (e.g. 'last updated in March 2023') than Dataset B",
        "Dataset B contains tasks requiring interaction with community forums/support channels not prominent in Dataset A",
        "Dataset A tasks more frequently specify commercial/professional use cases (enterprise pricing, API deployment) compared to Dataset B",
        "Dataset B includes more tasks requiring format conversion/export functionality (e.g. converting to Parquet) than Dataset A",
        "Dataset A tasks more commonly involve direct model performance comparison using quantitative metrics than Dataset B",
        "Dataset B contains more tasks requiring identification of beginner-friendly resources/entry-level models than Dataset A",
        "Dataset A tasks more frequently require verification of technical implementation details (e.g. bit precision settings)",
        "Dataset B includes more tasks involving commercial licensing verification and permissible use cases than Dataset A"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Dataset B tasks require finding research paper citations (e.g. arXiv papers) for model architectures/methods, while A focuses on API documentation navigation",
        "Dataset B tasks involve analyzing model architecture details (e.g. vSDXL-2) while A focuses on application parameters like temperature settings",
        "Dataset B requires identifying model training datasets through generated outputs, while A focuses on direct dataset content inspection",
        "Dataset B tasks include specific benchmark performance checks (MMMU), while A focuses on general evaluation metrics from model cards",
        "Dataset B contains tasks about non-commercial licensing restrictions, while A focuses on general commercial/research compliance",
        "Dataset B requires multi-step model generation+analysis (e.g. generate image then find training data), while A uses direct search/filter workflows",
        "Dataset B tasks involve academic research components (paper findings, benchmark comparisons), while A focuses on practical implementation",
        "Dataset B includes model restriction analysis (e.g. \"report issues accessing models\"), while A focuses on availability/update status",
        "Dataset B tasks require cross-referencing model versions (preview vs base vs instruct), while A focuses on latest/new models",
        "Dataset B contains explicit ethics-related queries (ethical conversational AI), while A focuses on technical capability analysis"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Dataset B tasks require locating models/datasets with explicit commercial integration capabilities (e.g., \"integrating into commercial products\"), while Dataset A focuses on general enterprise features",
        "Dataset B contains tasks requiring interaction with non-English language content (e.g., German installation instructions), which are absent in Dataset A",
        "Dataset B includes tasks requiring direct paper/research artifact retrieval (e.g., \"download its source\"), while Dataset A focuses on technical documentation",
        "Dataset B tasks involve version history tracking of models/datasets (e.g., \"version history of codegen-350M-mono\"), not present in Dataset A",
        "Dataset B contains error resolution tasks post-learning (e.g., troubleshooting after NLP course completion), absent in Dataset A",
        "Dataset B requires GitHub platform integration (e.g., GitHub Actions setup), while Dataset A focuses solely on HF ecosystem tools",
        "Dataset B tasks demand beginner-friendly technical guides (e.g., \"beginner's guide to text-to-image models\"), whereas Dataset A assumes intermediate technical proficiency",
        "Dataset B includes multimodal generation tasks beyond core NLP (e.g., image generation from text), while Dataset A focuses primarily on text-based models",
        "Dataset B contains dataset content analysis tasks (e.g., \"math problems with solutions\"), whereas Dataset A focuses on dataset metadata analysis",
        "Dataset B requires explicit software installation/license verification (e.g., \"installation instructions for NexaAIDev\"), while Dataset A focuses on usage restrictions"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Dataset B tasks focus on basic resource discovery (e.g. 'Find a model for X') rather than comparative analysis of multiple resources",
        "Dataset B requires direct interaction with model deployment processes (e.g. 'Deploy a NLP model') while A focuses on configuration understanding",
        "Dataset B emphasizes troubleshooting specific error messages (e.g. 'Resolve Task not found error') as primary tasks",
        "Dataset B contains tasks requiring installation/implementation steps (e.g. 'Install Nexa SDK on Mac') not present in A",
        "Dataset B tasks frequently involve accessing raw files (e.g. 'Find commit history', 'Access README') rather than curated metadata",
        "Dataset B includes commercial license verification tasks (e.g. 'Check commercial use permissions') as explicit requirements",
        "Dataset B tasks focus on single-resource investigation rather than cross-referencing multiple resources",
        "Dataset B requires interaction with academic research components (e.g. 'Find research papers', 'Access arXiv papers')",
        "Dataset B contains tasks about platform infrastructure (e.g. 'Learn about Hub repositories', 'Understand pricing models')",
        "Dataset B emphasizes legal/compliance aspects (e.g. 'Report unlicensed model use') not present in A's tasks"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Dataset B tasks require verifying model alignment with ethics policies for adoption suitability",
        "Dataset B tasks involve converting dataset formats (e.g., to Parquet) during analysis",
        "Dataset B requires checking model usage licenses and commercial restrictions",
        "Dataset B tasks focus on CPU inference optimization techniques using specific libraries",
        "Dataset B includes integration tasks between models and external formats/tools (Parquet, local deployment)",
        "Dataset B tasks require understanding dataset structuring conventions",
        "Dataset B contains explicit requirements to locate model source code implementations",
        "Dataset B tasks involve extracting medical domain-specific information from datasets",
        "Dataset B emphasizes model adoption criteria beyond technical capabilities",
        "Dataset B includes multilingual translation requirements with specific language pair constraints"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Tasks in dataset B emphasize promotional offers and pricing models (e.g., $199/year for unlimited access) not present in dataset A.",
        "Dataset B includes queries for courses with explicit AI integration in certifications (e.g., 'AI Python for Beginners') whereas dataset A does not prioritize this.",
        "Tasks in dataset B require comparing career-aligned roles (e.g., 'Social Media Strategist') with granular salary metrics, unlike dataset A's general role exploration.",
        "Dataset B tasks focus on skill requirements for specific job roles (e.g., 'skills required for a data analyst'), while dataset A focuses on course attributes like duration or prerequisites.",
        "Queries in dataset B target guided projects (e.g., 'guided project for Python') absent in dataset A's task samples.",
        "Dataset B includes newer or trending topics like TikTok marketing and Generative AI engineering, whereas dataset A emphasizes established domains like IoT or blockchain.",
        "Tasks in dataset B involve validating course reviews and ratings (e.g., 'read reviews for AI applications course') more frequently than dataset A.",
        "Dataset B highlights financial metrics (e.g., median salary figures like '$151,424') directly in role descriptions, while dataset A references salary data indirectly.",
        "Queries in dataset B prioritize recency filters (e.g., 'newest data science courses') compared to dataset A's focus on static metadata like duration or partner institutions.",
        "Dataset B tasks explore enrollment workflows (e.g., 'enroll in IBM Data Science Certificate') and curriculum details more explicitly than dataset A."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Dataset B includes navigation tasks focused on social impact and human rights education, while Dataset A does not",
        "Dataset B requires identification of courses/programs with explicit ethical frameworks (e.g., finance for social good)",
        "Tasks in Dataset B involve searching for courses available in non-English languages (e.g., German)",
        "Dataset B contains tasks requiring comparison of university admission processes and deadlines",
        "Dataset B emphasizes Python programming integration within data science course requirements",
        "Navigation tasks in Dataset B specifically target career development pathways within technical domains",
        "Dataset B requires identification of AI courses with direct data analysis applications as core components",
        "Tasks in Dataset B involve detailed analysis of course module structures (e.g., specific IBM AI course modules)",
        "Dataset B includes comparative analysis of leadership-focused courses rather than purely technical comparisons",
        "Dataset B contains explicit requirements for free beginner courses in creative fields (e.g., graphic design)"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Dataset B tasks require identifying specific skills gained from courses (e.g., 'skills gained from Python for Everybody'), while Dataset A focuses on structural details like module video counts or partner institutions.",
        "Dataset B includes queries for career-oriented paths (e.g., 'how to become a Data Analyst'), whereas Dataset A emphasizes academic program formats (e.g., Degrees vs. Certificates).",
        "Dataset B tasks involve newer/niche topics like Generative AI, Prompt Engineering, and Agile Project Management, while Dataset A centers on traditional domains like Renewable Energy or Quantum Physics.",
        "Dataset B requires comparing courses (e.g., 'Compare Python courses for business professionals'), while Dataset A focuses on filtering by explicit parameters like duration or rating thresholds.",
        "Dataset B tasks prioritize industry-specific solutions (e.g., 'Healthcare Analytics,' 'Digital Transformation'), whereas Dataset A emphasizes cross-referencing instructor information across courses.",
        "Dataset B includes queries for definitions (e.g., 'definition of data analytics') and qualitative feedback (e.g., 'what people are saying about a course'), absent in Dataset A tasks.",
        "Dataset B tasks target technical tools (e.g., TensorFlow, Docker, AWS) and programming languages, while Dataset A focuses on broader technical skills like Python or Agile.",
        "Dataset B requires identifying the newest courses (e.g., 'newest GRE preparation courses'), whereas Dataset A tasks involve static catalog exploration (e.g., 'list three free courses').",
        "Dataset B tasks emphasize enrollment actions (e.g., 'Enroll in Programming Languages, Part A'), while Dataset A focuses on informational extraction (e.g., 'summarize instructor bio').",
        "Dataset B includes queries for career-specific certifications (e.g., 'Google Project Management Professional Certificate'), whereas Dataset A emphasizes university partnerships (e.g., 'TUM course details')."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Dataset B tasks require identifying career outcome data by job title (e.g., Data Analyst role) rather than general career statistics",
        "Dataset B includes queries about financial aspects of courses (e.g., interest-free costs) not present in A",
        "Tasks in B require navigation to professional certificate enrollment pages rather than just metadata extraction",
        "Dataset B contains explicit requests to filter courses by language preferences (e.g., English subtitles)",
        "B includes tasks requiring configuration of course settings/preferences (e.g., language filters)",
        "Dataset B queries focus on behavioral finance and personal finance topics rather than general finance courses",
        "Tasks in B emphasize hands-on learning integrations (e.g., Python + Finance combinations) rather than standalone skills",
        "Dataset B requires identifying courses with reinforcement learning applications in finance (niche specialization)",
        "B tasks involve comparing courses across broader skill areas (leadership + technical) rather than single disciplines",
        "Dataset B includes explicit queries about applied degree programs (e.g., Master's in Applied Data Science)"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Dataset B tasks require verifying admission processes and enrollment prerequisites for degree programs, while Dataset A does not",
        "Dataset B emphasizes identifying courses with specific tool integration (e.g., Electric VLSI EDA Tool), whereas Dataset A focuses on general AI/ML content verification",
        "Dataset B tasks demand comparison of refund policies and financial investment details, which are absent in Dataset A requirements",
        "Dataset B includes navigation for career path alignment assessments between courses and job roles, unlike Dataset A's general career-oriented filtering",
        "Dataset B requires explicit identification of course/program accreditation statuses, while Dataset A only verifies institution partnerships",
        "Dataset B tasks involve exploring degree program concentrations/specializations within broader academic programs, not present in Dataset A",
        "Dataset B contains queries about financial aid availability and scholarship options, which are absent from Dataset A tasks",
        "Dataset B emphasizes verification of language localization features (e.g., English-taught courses) more prominently than Dataset A",
        "Dataset B requires identification of course-specific technical prerequisites (e.g., software/hardware requirements), while Dataset A focuses on skill-level prerequisites",
        "Dataset B tasks demand comparison of learning formats (self-paced vs instructor-led) across similar programs, unlike Dataset A's duration-based filtering"
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Tasks in B require detailed interaction with specific paper sections (e.g., 'related work', 'background') not explicitly required in A",
        "B includes tasks requiring troubleshooting (e.g., HTML conversion errors) absent in A",
        "B tasks involve finding/referencing citations within papers while A focuses on metadata extraction",
        "B requires understanding document structure/formatting (e.g., TeX files, licensing info) beyond A's metadata requirements",
        "B contains tasks requiring cross-referencing with external academic platforms (e.g., dblp) not present in A",
        "B includes explicit paper download instructions while A focuses on content access/parsing",
        "B tasks require author-specific searches (e.g., 'Dr. Schaffer's publications') rather than A's general author metadata retrieval",
        "B contains tasks using full arXiv identifiers (e.g., 2412.18601) rather than A's general ID usage",
        "B includes technical implementation details (e.g., quantum middleware architecture) as search targets unlike A's domain-focused queries",
        "B requires interpreting content for specific technical definitions (e.g., 'stablecoin trilemma') while A focuses on factual extraction"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Dataset B tasks require direct interaction with paper-specific technical elements (source code downloads/error messages) not present in A",
        "Dataset B emphasizes precise location of intra-paper components (figures/references/sections) while A focuses on document-level retrieval",
        "Dataset B contains tasks requiring understanding of paper revision/resubmission processes absent in A",
        "Dataset B includes interdisciplinary research queries spanning multiple academic domains unlike A's discipline-specific tasks",
        "Dataset B tasks frequently require cross-referencing between paper content and external experimental frameworks (e.g., Muon g-2)",
        "Dataset B contains explicit requirements for comparing/contrasting multiple paper versions (e.g., v3 submission dates)",
        "Dataset B tasks involve complex author research beyond basic metadata (specific publications/career tracking)",
        "Dataset B requires interpretation of technical diagrams/visualizations within papers",
        "Dataset B includes queries about emerging technology applications in academic publishing (e.g., Quantum-HPC middleware)",
        "Dataset B tasks demand integration of mathematical proofs/theorems with experimental results in paper analysis"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Dataset B requires direct title-based paper searches while Dataset A focuses on category/identifier-based queries",
        "Dataset B tasks involve retrieving full-text sections (e.g. results, methodology) rather than just abstracts/introductions",
        "Dataset B includes troubleshooting tasks (e.g. layout errors, HTML accessibility issues) not present in Dataset A",
        "Dataset B contains tasks requiring citation analysis and reference tracking between papers",
        "Dataset B requires handling of paper download/format selection workflows (PDF vs HTML)",
        "Dataset B includes license verification and copyright information retrieval tasks",
        "Dataset B tasks involve author-specific searches beyond general author metadata extraction",
        "Dataset B requires navigation through paper version histories and supplementary materials",
        "Dataset B contains tasks requiring cross-repository validation (arXiv IDs vs external references)",
        "Dataset B includes legal/compliance aspects (terms of use, privacy policy navigation)"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Dataset B tasks require locating and interpreting non-standard paper sections (e.g., background, design figures)",
        "Dataset B includes tasks involving source code retrieval from research papers",
        "Dataset B requires handling ordinal references (e.g., '10th reference') rather than journal references",
        "Dataset B tasks emphasize PDF format retrieval over HTML format differentiation",
        "Dataset B contains tasks requiring verification of paper existence in specific repositories",
        "Dataset B includes technical content comprehension tasks (e.g., spectroscopic methods, lensing analysis)",
        "Dataset B tasks involve direct section navigation (e.g., 'Recovering the Ionization Fraction' subsection)",
        "Dataset B requires identification of architectural components in technical papers",
        "Dataset B contains more open-ended search queries without temporal constraints",
        "Dataset B includes tasks requiring validation of publication timelines (e.g., 2025 papers)"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Dataset A tasks require precise timeframe filtering (e.g., last day/week) while Dataset B tasks use broader temporal references",
        "Dataset A involves merchandise/store interactions absent in Dataset B tasks",
        "Dataset B tasks focus on paper downloading/format retrieval not present in Dataset A",
        "Dataset A requires counting/quantification of results (e.g., number of papers) unlike Dataset B",
        "Dataset B includes troubleshooting/process guidance tasks (e.g., submission issues) not found in Dataset A",
        "Dataset A tasks specify exact version history checks (e.g., v3 dates) while Dataset B focuses on current versions",
        "Dataset B contains author contribution analysis requirements absent in Dataset A",
        "Dataset A requires cross-referencing between submission dates and content updates, unlike Dataset B",
        "Dataset B tasks involve citation/reference tracing within papers not required in Dataset A",
        "Dataset A includes organizational structure queries (e.g., leadership team) missing from Dataset B"
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Tasks in Dataset A require extracting specific article details (author names, event locations) while Dataset B tasks focus on broader content retrieval without such granularity",
        "Dataset A tasks involve explicit time constraints (e.g. 'within last two days') while Dataset B includes open-ended temporal requirements",
        "Dataset B contains tasks requiring navigation to specialized platforms (BBC Sounds, Super Bowl events) not present in Dataset A",
        "Dataset A emphasizes regional filtering (Europe/Asia-specific queries) while Dataset B features multi-region cross-referencing",
        "Dataset B includes tasks involving future event tracking (e.g. January 2025 fixtures) absent in Dataset A",
        "Dataset A requires parsing hierarchical subsection structures (e.g. 'War in Ukraine') while Dataset B tasks use flatter navigation",
        "Dataset B contains multimedia consumption tasks (podcasts/video highlights) not emphasized in Dataset A",
        "Dataset A tasks demand comparative analysis (country counts in leaderboards) while Dataset B focuses on singular information retrieval",
        "Dataset B includes interactive content management tasks (bookmarking information) absent in Dataset A",
        "Dataset A maintains consistent news article focus while Dataset B spans diverse formats (weather forecasts, hotel features, stock updates)"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Tasks in dataset B require locating multimedia content (podcasts, videos) through direct navigation rather than metadata extraction from articles",
        "Dataset B tasks involve exploratory navigation of website structure/sections without predefined content targets (e.g. 'familiarize with layout')",
        "Time sensitivity requirements are implicit rather than explicitly stated in dataset B tasks (vs. explicit recency filters in A)",
        "Dataset B contains service-oriented tasks requiring form interactions (course enrollment, weather forecasts) rather than pure content consumption",
        "Geographic filtering in B extends beyond news regions to localized services/education (Cornwall courses, Gayo tourism)",
        "Tasks in B require identification of persistent content (schedules, membership figures) rather than time-bound news updates",
        "Dataset B includes financial market tracking requirements absent from A's news-focused tasks",
        "B's tasks demand navigation through multi-platform content (podcast directories, hotel sites) rather than single-domain news hierarchy",
        "Interactive content exploration (video watching, podcast browsing) is required in B beyond article/text consumption in A",
        "Dataset B contains meta-navigation tasks assessing user orientation ('explore sections') rather than information retrieval objectives"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Dataset A tasks require navigating through specific sub-sections (e.g., 'Green Living', 'War sections', 'Horse Racing') while Dataset B focuses on broader categories without granular sub-section navigation",
        "Dataset A tasks require precise timestamp verification (e.g., 'within last two days') while Dataset B tasks accept more flexible recency parameters",
        "Dataset A includes tasks requiring identification of author names and publication dates, while Dataset B tasks omit these metadata requirements",
        "Dataset B tasks involve real-time weather data analysis (e.g., 'weather forecast for cities') unlike Dataset A",
        "Dataset A requires interaction with specialized content formats (e.g., 'Leaderboards', 'SpecialList') absent in Dataset B",
        "Dataset B tasks include podcast navigation (e.g., 'BBC Football Daily podcast') while Dataset A focuses on written articles",
        "Dataset A contains tasks requiring mathematical calculations (e.g., 'count which country has most players') not present in Dataset B",
        "Dataset B tasks involve international diplomatic content analysis (e.g., 'Syria conflict', 'Israel-Gaza updates') while Dataset A focuses on domestic reporting",
        "Dataset A requires navigation through entertainment-specific categories (e.g., 'Music News', 'Hollywood') absent from Dataset B's scope",
        "Dataset B tasks include interactive elements (e.g., 'book a room at Tokyo hotel') while Dataset A focuses on passive information retrieval"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Dataset B tasks require navigating to technical/scientific topics (e.g., graphene, cardiac physiology) absent in Dataset A",
        "Dataset B includes explicit research objectives involving academic institutions (e.g., university courses) not present in Dataset A",
        "Dataset B contains tasks requiring interaction with multimedia controls (e.g., full-screen mode, video pausing) beyond basic playback",
        "Dataset B tasks involve specific historical event research (e.g., 2004 tsunami) rather than only current affairs",
        "Dataset B includes navigation to external organization websites (e.g., NASA) not required in Dataset A tasks",
        "Dataset B tasks require comparative analysis of environmental impacts (e.g., flying alternatives) beyond basic impact reporting",
        "Dataset B contains explicit charity/donation-related objectives (e.g., Christmas donations) absent in Dataset A",
        "Dataset B includes multi-location weather data retrieval (hour-by-hour forecasts across cities) not seen in Dataset A",
        "Dataset B tasks involve specific podcast episode retrieval (e.g., Sporting Giants series) rather than general cultural content",
        "Dataset B requires analysis of future economic trends (e.g., 2025 remote work) rather than current business impact assessments"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Tasks in dataset B require accessing real-time or future-dated event information (e.g., '2025 New Year celebrations')",
        "Dataset B includes tasks involving niche/localized topics (e.g., 'Gayo Arabica Coffee region', 'Tayside & Central Scotland')",
        "Tasks in B demand interaction with multimedia beyond articles (e.g., podcasts, nature walks, hotel service videos)",
        "Dataset B requires verification of claims/political statements (e.g., 'Trump's Panama Canal claims')",
        "B tasks involve retrieving specific technical/course details (e.g., 'BSc Cardiac Physiology course')",
        "Dataset B includes exploratory instructions without predefined categories (e.g., 'Browse through different BBC sections')",
        "Tasks in B focus on granular incident reports (e.g., 'New Orleans attack details', 'Sydney Hobart yacht race deaths')",
        "Dataset B requires accessing sports fixtures/updates with exact dates (e.g., 'Premier League February 2025')",
        "B tasks involve cross-referencing geopolitical conflicts (e.g., 'Eastern Europe conflicts with Ukraine/Russia/Azerbaijan')",
        "Dataset B includes forward-looking tech/trend analysis (e.g., 'remote work trends 2021-2025', 'Elon Musk's X evolution')"
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Dataset A tasks require multi-step filtering with exact attribute specifications (e.g., 'memory foam material'), while Dataset B tasks use broader search terms (e.g., 'affordable skincare')",
        "Dataset A emphasizes technical specifications (e.g., '1TB disk size', '10x zoom'), whereas Dataset B focuses on lifestyle/general categories (e.g., 'gourmet food', 'gift ideas')",
        "Dataset B contains tasks specifically targeting luxury/pre-owned items (e.g., 'pre-loved Louis Vuitton'), while Dataset A focuses on standard used condition filters",
        "Dataset A tasks frequently require cross-referencing multiple attributes simultaneously (e.g., 'price + size + rating'), while Dataset B tasks often seek single-attribute optimization (e.g., 'cheapest shampoo')",
        "Dataset B includes brand-specific searches (e.g., 'Acer Nitro V', 'Homtiem Black Garlic'), whereas Dataset A emphasizes product category searches",
        "Dataset A tasks explicitly verify return/shipping policies, while Dataset B tasks focus more on purchase completion",
        "Dataset B contains more exploratory tasks (e.g., 'browse top-selling products'), while Dataset A focuses on targeted product retrieval",
        "Dataset A requires configuration matching (e.g., 'purple yoga mat with 5mm thickness'), whereas Dataset B accepts general category matches (e.g., 'wireless earbuds')",
        "Dataset B includes multi-product purchase tasks (e.g., 'multiple Harry Potter books'), while Dataset A focuses on single-item purchases",
        "Dataset A tasks emphasize quantitative thresholds (e.g., '300 sq ft capacity'), while Dataset B uses qualitative descriptors (e.g., 'high-end floor lamp')"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Tasks in dataset B focus on broader product categories without specifying detailed attributes (e.g., 'Buy a gift for her') compared to dataset A's granular attribute requirements (e.g., 'hypoallergenic mattress topper with memory foam').",
        "Dataset B includes tasks related to account creation (e.g., 'Create an Amazon account'), absent in dataset A.",
        "Dataset B tasks often lack explicit price range constraints (e.g., 'Find the price of car chargers') compared to dataset A's strict price filters (e.g., '$50\u2013$100').",
        "Dataset B tasks involve purchasing generic items (e.g., 'Add 1 item to cart') without specifying features, unlike dataset A's requirement for specific product states (e.g., 'Used - Good').",
        "Tasks in dataset B emphasize Prime-exclusive services (e.g., 'Rent the movie \"Inception\" on Prime Video'), not present in dataset A.",
        "Dataset B includes luxury/pre-owned item searches (e.g., 'pre-owned Louis Vuitton clutch handbags'), while dataset A focuses on mainstream consumer goods.",
        "Dataset B tasks frequently omit customer review/rating criteria (e.g., 'Find gourmet seasoning sets') compared to dataset A's emphasis on '4+ stars' ratings.",
        "Dataset B contains seasonal/event-driven tasks (e.g., 'Buy an Amazon gift card for Christmas') absent in dataset A.",
        "Tasks in dataset B require fewer hierarchical menu navigations (e.g., 'Browse electronics') compared to dataset A's multi-step category\u2192subcategory\u2192product navigation.",
        "Dataset B includes vague instructions like 'Find alternatives to...' or 'Browse exercise equipment,' whereas dataset A tasks have tightly scoped parameters."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Tasks in B emphasize purchasing luxury or designer-branded products (e.g., Oscar de la Renta, Louis Vuitton).",
        "Dataset B includes tasks requiring navigation to Amazon Fresh for grocery items (e.g., bananas, fresh fruits).",
        "B involves interactions with Prime-exclusive services (e.g., Prime Video, Grubhub benefits).",
        "Tasks in B focus on gift customization (e.g., graduation-themed gift cards, gift baskets).",
        "Dataset B requires exploration of subscription-based content (e.g., Kindle Unlimited, Audible audiobooks).",
        "B includes tasks related to seasonal or event-driven promotions (e.g., Winter Sale, NFL Wild Card games).",
        "Tasks in B involve eco-friendly or sustainable product categories (e.g., eco-friendly sponges, solar lights).",
        "Dataset B emphasizes brand-specific navigation (e.g., Amazon Basics, Shopbop).",
        "B includes tasks requiring interaction with Amazon\u2019s bundled services (e.g., Prime membership benefits).",
        "Dataset B tasks often target niche audiences (e.g., pet diaper accessories, resistance bands for fitness)."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Dataset B tasks focus on purchasing generic product categories (e.g., 'pet supplies', 'office supplies') rather than specific technical specifications",
        "Dataset B includes open-ended gift shopping tasks without predefined filters (e.g., 'gifts for woman', 'holiday gift ideas')",
        "Dataset B requires identification of brand-specific products (e.g., 'Louis Vuitton', 'Aquazzura') more frequently than Dataset A",
        "Dataset B contains tasks focused on identifying 'most expensive' items rather than budget-constrained searches",
        "Dataset B includes event-based shopping tasks (e.g., 'Winter Sale deals') not present in Dataset A",
        "Dataset B tasks emphasize product categories as shopping themes (e.g., 'eco-friendly', 'luxury') rather than detailed attribute combinations",
        "Dataset B contains more vague quantity-based actions (e.g., 'Add 3 items', 'Add some items') without specific product requirements",
        "Dataset B includes brand exploration tasks (e.g., 'find out more about Luxury Store') rather than direct purchases",
        "Dataset B tasks focus on 'best selling' status rather than customer rating thresholds",
        "Dataset B requires identification of product lines/series (e.g., 'Harry Potter series') rather than individual items"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Dataset B tasks involve purchasing generic items without detailed attribute specifications (e.g., 'Add a laptop to cart') while A requires precise attribute-based filtering (e.g., 'hypoallergenic mattress topper with memory foam').",
        "Dataset B includes non-navigation tasks like testing CAPTCHA functionality, absent in A.",
        "Dataset B tasks emphasize gift-oriented actions (e.g., 'Buy an Amazon eGift Card as a Christmas gift') whereas A focuses on personal-use product acquisition.",
        "Dataset B lacks explicit price-range filtering requirements (e.g., 'Find birthday gift ideas') present in most A tasks (e.g., 'priced between $50 to $100').",
        "Dataset B prioritizes bulk cart additions (e.g., 'Add 5 items to cart') without post-search verification steps required in A (e.g., checking return policies).",
        "Dataset B contains broad browsing tasks (e.g., 'Browse different categories') while A requires navigating category-specific sections with granular filters.",
        "Dataset B tasks focus on discovering popular/trending items (e.g., 'most popular winter clothing') whereas A emphasizes niche feature searches (e.g., 'water-resistant design').",
        "Dataset B includes price lookup tasks without comparison requirements (e.g., 'Find the price of a Dole banana') unlike A's price comparison mandates.",
        "Dataset B tasks involve adding items to cart as primary objectives (e.g., 'Add Wahl USA Pet Shampoo to cart') while A requires criteria verification before cart actions.",
        "Dataset B contains meta-tasks related to website functionality (e.g., CAPTCHA testing) absent in A's product-focused workflows."
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Dataset B tasks include basic arithmetic operations (e.g., 3\u00d75) absent in Dataset A's advanced computational focus",
        "Dataset B contains explicit requests for general knowledge retrieval (e.g., 'Find information about Earth's climate') not emphasized in Dataset A",
        "Dataset B includes linguistic/definition-based queries (e.g., word lookups, paradox explanations) unlike Dataset A's pure STEM focus",
        "Dataset B features introductory-level equation solving with step-by-step requests (e.g., 4x+3=19) vs Dataset A's complex calculus/differential equations",
        "Dataset B contains meta-queries about Wolfram Alpha's own capabilities (e.g., 'Explore website features') not present in Dataset A",
        "Dataset B includes historical/contextual research tasks (e.g., Industrial Revolution) while Dataset A focuses on real-time data applications",
        "Dataset B shows emphasis on chemical structure visualization requests (e.g., Lewis structures) rather than Dataset A's quantitative conversions",
        "Dataset B contains basic statistical concept lookups (e.g., beta distribution definitions) vs Dataset A's applied statistical analysis tasks",
        "Dataset B includes financial information queries (e.g., foreign debt, company finances) as primary targets rather than Dataset A's financial metric calculations",
        "Dataset B features astronomical event queries (e.g., solar eclipse timing) contrasting with Dataset A's physics/engineering energy calculations"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Dataset B tasks include requests for historical/cultural information (e.g., King Charles III, Mahabharata) absent in A",
        "Dataset B contains linguistic analysis tasks (e.g., word etymology, language comparisons) not present in A",
        "Dataset B requires exploration of philosophical/logical concepts (e.g., Liar Paradox) unlike A's concrete calculations",
        "Dataset B includes biographical/person-focused queries (e.g., Ganesha information) while A focuses on impersonal data",
        "Dataset B tasks request visualizations/downloads of mathematical results (e.g., derivative plots) more explicitly than A",
        "Dataset B contains meta-queries about Wolfram Alpha's own capabilities/features not found in A's task-oriented requests",
        "Dataset B includes chemical compound information requests (e.g., eicosapentaenoic acid structure) without conversion/stoichiometry focus seen in A",
        "Dataset B tasks involve mythological/religious content searches (e.g., Hindu deities) absent from A's scientific focus",
        "Dataset B requires exploration of mathematical foundations (e.g., prime number definitions) rather than A's applied computations",
        "Dataset B contains explicit requests for generating functions/code-related mathematics (e.g., A000108) not present in A"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Dataset B tasks involve retrieving time-series data (e.g., monthly temperature anomalies over multiple years)",
        "Dataset B requires understanding astronomical event timings (e.g., moon phases, sunrise/sunset times)",
        "Dataset B includes tasks focused on conceptual explanations (e.g., \"Explain chemical thermodynamics\")",
        "Dataset B tasks involve linguistic analysis (e.g., word etymology, natural language semantics)",
        "Dataset B requires medical/health-related data retrieval (e.g., cancer information, health metrics)",
        "Dataset B includes open-ended exploration of mathematical relationships (e.g., Fibonacci-Collatz connections)",
        "Dataset B tasks involve future event predictions (e.g., next solar eclipse timing)",
        "Dataset B requires statistical distribution parameter analysis (e.g., beta distribution root calculations)",
        "Dataset B includes humanities-focused queries (e.g., historical paradoxes, artistic concepts)",
        "Dataset B tasks require format-specific data output handling (e.g., saving molecular structures in XLS)"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Dataset A tasks primarily require direct numerical computation (e.g., integrals, unit conversions, physical calculations), while Dataset B includes exploratory queries about definitions, theories, or conceptual explanations (e.g., etymology, paradoxes, theorem definitions).",
        "Dataset A tasks emphasize structured real-world applications (e.g., energy production, metabolic properties), whereas Dataset B includes abstract or theoretical inquiries (e.g., polyhedra properties, paradoxes).",
        "Dataset B tasks explicitly involve platform-specific features (e.g., pricing inquiries, Pro plan details, resource exploration), while Dataset A focuses solely on external problem-solving.",
        "Dataset A tasks demand step-by-step procedural solutions (e.g., solving integrals, deriving equations), whereas Dataset B includes requests for static information retrieval (e.g., chemical properties, stock data).",
        "Dataset B tasks frequently involve educational or homework-oriented goals (e.g., math problem generators, step-by-step homework help), while Dataset A prioritizes technical or engineering use cases.",
        "Dataset B includes linguistic or humanities-focused queries (e.g., verb etymology, paradox definitions), which are absent in Dataset A\u2019s strictly STEM-oriented tasks.",
        "Dataset A tasks require comparisons between methodologies or entities (e.g., packing densities, food calorie comparisons), while Dataset B focuses on singular entity exploration (e.g., ProductLog function, Mars sidereal day).",
        "Dataset B tasks often reference Wolfram Alpha\u2019s internal datasets or tools (e.g., \u201cexplore Wolfram resources,\u201d \u201cdownload chemical properties\u201d), whereas Dataset A treats the platform as a computational blackbox.",
        "Dataset B includes financial planning tasks (e.g., annuity present value, investment calculations) with practical user-centric goals, while Dataset A\u2019s financial queries are purely analytical (e.g., stock price averages).",
        "Dataset B tasks occasionally involve meta-level actions (e.g., verifying platform permissions, reviewing download formats), which are absent in Dataset A\u2019s straightforward computational requests."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Dataset B tasks emphasize basic factual queries (e.g., element properties, boiling points) rather than advanced computational problem-solving.",
        "Dataset B includes explicit requests for formatted outputs (e.g., TeX, image downloads) not present in Dataset A tasks.",
        "Dataset B contains exploratory or instructional tasks (e.g., 'learn about paradoxes', 'explore polyhedra types') alongside computational goals.",
        "Dataset B tasks frequently involve simple validations (e.g., checking if a number is even/odd) without multi-step reasoning.",
        "Dataset B tasks focus on single-fact retrieval (e.g., GDP values, chemical properties) rather than comparative analyses between entities.",
        "Dataset B includes user-specific personal metric calculations (e.g., BMI, basal metabolic rate) with individualized inputs.",
        "Dataset B tasks require basic equation solving (e.g., linear equations) without derivatives or integrals seen in Dataset A.",
        "Dataset B tasks involve climate model analyses with temporal parameters (e.g., percent change from 2004 to 2025).",
        "Dataset B includes astronomical event queries (e.g., solar eclipses, exoplanet data) requiring time-based or spatial data retrieval.",
        "Dataset B tasks target corporate financial metrics (e.g., company performance) rather than macroeconomic calculations like GDP comparisons in Dataset A."
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Dataset A tasks require exact numerical thresholds (e.g., 4 stars, 30 minutes), while Dataset B tasks use qualitative terms (e.g., 'good reviews') or lack specific numerical constraints.",
        "Dataset A tasks explicitly require generating ingredient shopping lists, whereas Dataset B tasks do not mention this action.",
        "Dataset A tasks consistently demand checking and noting nutrition facts (e.g., calories per serving), while Dataset B tasks omit nutritional detail requirements.",
        "Dataset B includes tasks focused on repurposing leftover ingredients, which are absent in Dataset A.",
        "Dataset B tasks frequently request kid-friendly recipes, a criterion not emphasized in Dataset A.",
        "Dataset A tasks combine multiple specific constraints (e.g., dietary, time, rating), while Dataset B tasks often have simpler or single criteria.",
        "Dataset B tasks involve modifying recipes (e.g., adapting without a candy thermometer), whereas Dataset A focuses on direct retrieval.",
        "Dataset B tasks prioritize occasion-specific recipes (e.g., Christmas desserts) more prominently than Dataset A.",
        "Dataset A tasks require parsing recipe titles/descriptions for explicit ingredients, while Dataset B tasks reference broader ingredient categories.",
        "Dataset B tasks include exploratory queries (e.g., 'find ideas for BBQ recipes'), while Dataset A tasks are strictly structured with defined parameters."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Dataset B tasks focus on broader meal planning and holiday menu preparation, while Dataset A requires specific nutritional or time-based filtering.",
        "Dataset B includes tasks for substituting ingredients (e.g., evaporated milk alternatives), which are absent in Dataset A.",
        "Dataset B emphasizes user-generated content interaction (e.g., leaving reviews/ratings) as a core task component, unlike Dataset A.",
        "Dataset B contains explicit requests for managing saved recipes (e.g., 'save a low-calorie recipe'), while Dataset A only references saving as a functionality.",
        "Dataset B features occasion-specific queries for non-traditional holidays/events (e.g., Halloween snacks), whereas Dataset A focuses on general occasions.",
        "Dataset B includes comparative analysis of multiple recipe versions (e.g., 'compare BBQ sauce recipes'), while Dataset A focuses on identifying single 'best version' recipes.",
        "Dataset B tasks require nutritional value comparisons between recipes, whereas Dataset A only prioritizes nutritional requirements in isolation.",
        "Dataset B contains queries for kid-friendly meal preparation constraints, which are not present in Dataset A's requirements.",
        "Dataset B includes specific diet plan integration tasks (e.g., Paleo, keto meal prep), while Dataset A focuses on general dietary preferences.",
        "Dataset B tasks involve multi-step recipe adaptation (e.g., modifying flavors in dip recipes), unlike Dataset A's direct information retrieval focus."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Dataset A tasks require precise numeric thresholds (e.g., 'under 600 calories', '50+ reviews') while Dataset B uses qualitative terms (e.g., 'highly-rated', 'leftover')",
        "Dataset A tasks involve multi-step comparisons (e.g., 'list 3 recipes', 'compare variations') while Dataset B focuses on single-item retrieval",
        "Dataset A explicitly requires extracting specific nutritional metrics (e.g., 'carbs per serving') while Dataset B only requests general nutrition information presence",
        "Dataset A tasks mandate strict rating/review count filters (e.g., '4.5 stars or higher') while Dataset B accepts broader quality indicators",
        "Dataset A requires navigating through multi-level category hierarchies (e.g., 'Dinners > Chicken') while Dataset B uses flat category navigation",
        "Dataset A tasks include explicit save/bookmark actions as required steps while Dataset B mentions saving as optional secondary actions",
        "Dataset A specifies exact preparation time constraints (e.g., 'under 30 minutes') while Dataset B uses relative time references (e.g., 'quick and easy')",
        "Dataset A requires identification of recipe variations (e.g., zucchini vs traditional lasagna) while Dataset B focuses on single recipe types",
        "Dataset A tasks demand ingredient quantity extraction for shopping lists while Dataset B only requires ingredient presence verification",
        "Dataset A includes complex filtering combinations (e.g., dietary + time + rating) while Dataset B uses single-criterion filters"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Dataset B tasks frequently involve dietary-specific searches (keto, gluten-free, low-carb) not emphasized in Dataset A",
        "Dataset B includes tasks requiring interaction with recipe reviews (posting/editing) beyond decision-making in Dataset A",
        "Dataset B tasks focus on leftover ingredient utilization (turkey, cranberry sauce) absent in Dataset A requirements",
        "Dataset B emphasizes holiday-specific recipe execution (Christmas desserts/appetizers) rather than general seasonal content in A",
        "Dataset B contains explicit meal prep planning tasks using recipe collections, unlike Dataset A's single-recipe focus",
        "Dataset B prioritizes kid-friendly meal constraints (snacks, dinners) more prominently than Dataset A's general audience approach",
        "Dataset B incorporates price checking/product research tasks (KitchenAid mixers) not present in Dataset A's pure recipe focus",
        "Dataset B requires ingredient substitution inquiries in recipes, demonstrating adaptive cooking needs absent in Dataset A",
        "Dataset B tasks integrate external resource navigation (Amazon cookbook searches) beyond internal recipe databases in Dataset A",
        "Dataset B emphasizes specific protein-centric cuisine types (Italian sausage, Asian chicken) rather than Dataset A's broader regional categories"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Tasks in dataset B require user interaction beyond filtering (e.g. leaving recipe reviews/suggestions)",
        "Dataset B includes navigation tasks focused on repurposing leftovers or excess ingredients",
        "Dataset B emphasizes seasonal/holiday-specific recipe discovery (e.g. Thanksgiving desserts, Christmas appetizers)",
        "Tasks in dataset B involve direct engagement with recipe creators through review systems",
        "Dataset B contains navigation paths for budget-conscious cooking and affordable meal planning",
        "Tasks in dataset B require exploring recipe variations/adaptations (e.g. healthier versions, substitutions)",
        "Dataset B includes navigation to specific cooking method sections (e.g. slow cooker, air fryer recipes)",
        "Tasks in dataset B emphasize community Q&A aspects (e.g. asking substitution questions)",
        "Dataset B contains navigation paths focused on meal prep strategies and storage tips",
        "Tasks in dataset B require comparing multiple recipe versions through user ratings/reviews"
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Dataset A tasks require users to report example sentences for queried words, while Dataset B tasks do not explicitly require example sentences.",
        "Dataset A includes tasks involving conversion of the website language (e.g., to Deutsch), whereas Dataset B tasks focus on translations (e.g., English\u2013Spanish) without language toggle requirements.",
        "Dataset A tasks explicitly mention identifying UK/US pronunciation differences, while Dataset B tasks focus on individual pronunciations without direct comparison.",
        "Dataset A tasks involve accessing the Cambridge Dictionary Shop section, which is absent in Dataset B tasks.",
        "Dataset A tasks require users to explore grammar rules (e.g., articles, passive voice) with specific examples, while Dataset B tasks mention grammar exploration but lack explicit rule-based examples.",
        "Dataset A tasks include word game interactions (e.g., Word Scramble) with timed challenges, whereas Dataset B tasks mention games only as general exploration.",
        "Dataset A tasks demand IPA notation reporting for pronunciations, while Dataset B tasks omit IPA requirements.",
        "Dataset A tasks require cross-referencing synonyms/antonyms, while Dataset B tasks focus on direct synonym identification without cross-referencing.",
        "Dataset A tasks involve navigating alphabetical indexes (A\u2013Z) for dictionary entries, whereas Dataset B tasks emphasize direct word searches without alphabetical navigation.",
        "Dataset B tasks include requests for syllable counts and word relations (e.g., \"hello\" and \"meeting\"), which are absent in Dataset A."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Dataset A tasks require specific element retrieval (e.g., exact IPA notations, countable/uncountable noun examples) while Dataset B focuses on broader concept exploration (e.g., 'learn about adjectives')",
        "Dataset A tasks emphasize direct UK/US English comparisons for pronunciation/grammar, whereas Dataset B includes general language feature inquiries without explicit regional comparisons",
        "Dataset A tasks demand explicit navigation to dedicated sections (e.g., Word Scramble game, Shop) while Dataset B tasks involve open-ended exploration of undefined areas",
        "Dataset A requires identification of quantitative information (e.g., number of word meanings) absent in Dataset B tasks",
        "Dataset B contains meta-navigation tasks testing error handling/website exploration mechanics, unlike Dataset A's focused information retrieval",
        "Dataset A tasks specify exact content formats (e.g., 'write down', 'report translation') while Dataset B uses vague instructions like 'learn about' or 'explore'",
        "Dataset B includes collaborative content creation tasks (e.g., 'Add content to Wikipedia') not present in Dataset A",
        "Dataset A requires cross-referencing between dictionary entries and grammar rules, while Dataset B tasks maintain singular focus per task (definitions OR grammar)",
        "Dataset B features linguistic pattern recognition tasks (e.g., collocations, word combinations) absent from Dataset A's straightforward lookup requirements",
        "Dataset A tasks consistently require multimedia interaction (audio pronunciations) while Dataset B focuses purely on text-based information retrieval"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Dataset B tasks involve exploring idioms/phrases (e.g., 'another/bite at the cherry') while Dataset A focuses on individual words.",
        "Dataset B tasks require interaction with a 'Popular searches' section (e.g., 'work', 'schedule'), absent in Dataset A.",
        "Dataset B tasks include explicit requests for synonyms/thesaurus usage (e.g., 'synonyms of inspire'), unlike Dataset A.",
        "Dataset B tasks target educational content creation (e.g., 'prepare teaching materials'), not present in Dataset A.",
        "Dataset B tasks involve English\u2013Spanish translation comparisons, while Dataset A focuses on English\u2013French/German.",
        "Dataset B tasks reference test preparation (e.g., TOEFL), whereas Dataset A tasks lack this context.",
        "Dataset B tasks include broader exploration of website features (e.g., 'Explore the features of the dictionary webpage').",
        "Dataset B tasks emphasize mental states/emotions (e.g., 'depression', 'unhappy'), absent in Dataset A.",
        "Dataset B tasks require cookie consent interactions (e.g., 'Accept Cookies'), not mentioned in Dataset A.",
        "Dataset B blog content focuses on gradual processes (e.g., 'bit by bit'), while Dataset A highlights price-related phrases."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Dataset B tasks require exploring synonyms for words (e.g., 'die', 'communication') explicitly, while A focuses on word definitions and example sentences without synonym emphasis.",
        "Dataset B includes tasks requiring multi-word translation lookups (e.g., 'apple' and 'Friday'), whereas A tasks focus on single-word translations.",
        "Dataset B tasks involve exploring related terms (e.g., 'communication' and 'jukebox, jujitsu'), while A tasks prioritize isolated word analysis.",
        "Dataset B tasks explicitly target adjective-specific grammar information (e.g., 'adjectives'), while A grammar tasks are broader (e.g., articles, passive voice).",
        "Dataset B includes financial terminology lookup (e.g., 'reinvest', 'reinvestment'), which is absent in A's sampled tasks.",
        "Dataset B tasks involve completing thematic quizzes (e.g., animal-related), whereas A focuses on the Word Scramble game exclusively.",
        "Dataset B tasks require phrase-level definition lookup (e.g., 'in a nutshell'), while A tasks focus on single-word definitions.",
        "Dataset B tasks include explicit requests for etymology explanations, which are not present in A's samples.",
        "Dataset B tasks emphasize comprehensive parts-of-speech research (e.g., 'verbs', 'nouns'), while A focuses on specific grammar rules (e.g., comparatives, articles).",
        "Dataset B tasks specify social media sharing on Twitter (e.g., 'statuette'), whereas A tasks mention general social sharing without platform specificity."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Dataset A tasks require explicit interaction with dictionary games (e.g., Word Scramble), while B does not include game-related tasks",
        "Dataset B tasks involve phrase/idiom translation (e.g., 'break a leg') rather than single-word translation prevalent in A",
        "Dataset B includes social sharing functionality requirements (e.g., 'Share the definition') absent in A",
        "Dataset A tasks specify time-bound challenges (e.g., 'beat the clock') not present in B's exploratory tasks",
        "Dataset B requires identification of collocations and semantic relationships between terms, unlike A's standalone definitions",
        "Dataset A features structured hierarchical navigation (dictionary\u2192grammar\u2192translations), while B emphasizes open-ended exploration of features",
        "Dataset B tasks focus on multi-word expressions/phrasal verbs (e.g., 'bit by bit'), whereas A focuses on individual lexical items",
        "Dataset A includes commercial interactions (e.g., 'browse Cambridge Dictionary Shop') not present in B",
        "Dataset B tasks require differentiation between grammatical categories (e.g., adverb vs adjective phrases) beyond A's basic grammar rule lookup",
        "Dataset A emphasizes explicit UK/US variant comparisons, while B focuses on pronunciation/phonetic analysis without regional specification"
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Dataset B tasks require identifying accessory compatibility details (e.g., watch band specifications)",
        "Dataset B includes tasks focused on Apple's environmental initiatives and carbon neutrality efforts",
        "Dataset B requires navigation through enterprise/business solution pages",
        "Dataset B tasks involve accessing healthcare-specific product applications",
        "Dataset B contains tasks requiring comparison of audio product lines (AirPods models)",
        "Dataset B tasks demand interaction with family sharing management features",
        "Dataset B includes tasks related to Apple's privacy policies and data usage explanations",
        "Dataset B requires navigation through app-specific update histories and award information",
        "Dataset B tasks involve finding battery optimization guidance and device maintenance information",
        "Dataset B contains tasks requiring identification of business/education success case studies"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Dataset B includes tasks requiring customization/personalization workflows (e.g., designing Apple Watch, configuring device purchases with AppleCare+)",
        "Dataset B contains warranty status verification and repair/service management tasks",
        "Dataset B requires navigation through enterprise/business-specific purchase programs and plans",
        "Dataset B includes battery optimization guidance across multiple device types",
        "Dataset B contains tasks related to Family Sharing configuration and parental controls",
        "Dataset B requires accessing corporate/financial information (earnings reports, investor relations)",
        "Dataset B includes environmental impact comparisons between specific product components/materials",
        "Dataset B contains device ecosystem integration tasks (Mac-iPhone connectivity features)",
        "Dataset B requires navigation through data privacy/protection policy documentation",
        "Dataset B includes App Store content discovery tasks (award winners, specific app features)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Tasks in B require broader exploration of product categories (e.g., 'Mac computers') rather than specific model configurations",
        "B includes tasks requiring identification of business/enterprise use cases and success stories",
        "B contains explicit requests for financial/business information (e.g., earnings reports)",
        "Tasks in B more frequently involve cross-category accessory pairing (e.g., iPhone cases with MacBooks)",
        "B requires navigation through institutional-specific educational pricing portals",
        "Tasks in B emphasize environmental report comparisons across product lines",
        "B includes explicit requests for healthcare-specific product implementations",
        "Tasks in B require differentiation between product lines and professional solutions (e.g., Herm\u00e8s vs standard models)",
        "B contains more purchasing workflow tasks (e.g., 'configure and purchase') with customization",
        "Tasks in B focus on identifying compatibility between devices and ecosystem integration"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Dataset B tasks require navigating enterprise-specific support options (e.g., AppleCare Help Desk for businesses)",
        "Dataset B contains queries about corporate documentation retrieval (e.g., Business Conduct Policy, financial reports)",
        "Dataset B emphasizes family account management features (e.g., Family Sharing setup, purchase restrictions)",
        "Dataset B includes tasks requiring configuration of business-specific solutions (e.g., enterprise device management)",
        "Dataset B tasks involve deeper financial workflows (e.g., business financing options, bulk enterprise purchases)",
        "Dataset B requires navigation to product sustainability information (e.g., environmental impact reports)",
        "Dataset B contains queries about newer product variants (e.g., AirPods 4, iPhone 16 Pro in Desert Titanium)",
        "Dataset B tasks focus on device personalization for enterprise use (e.g., bulk iPhone configurations for businesses)",
        "Dataset B includes localization-specific retail tasks (e.g., checking store hours, regional availability)",
        "Dataset B requires interaction with privacy/security features (e.g., iCloud+ Hide My Email, data tracking policies)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Dataset B tasks emphasize business/enterprise product configurations (e.g., iPad Pro for business) while A focuses on consumer-level customization",
        "Dataset B includes explicit troubleshooting tasks (e.g., cracked screen repairs, battery life extension) not present in A's technical support focus",
        "Dataset B contains healthcare-specific queries (Apple Watch health features, Health Records enrollment) absent from A's general support tasks",
        "Dataset B tasks require app-specific research (version history, reviews) while A focuses solely on hardware specifications",
        "Dataset B includes privacy/security investigations (data handling, parental controls) not found in A's authentication-focused support tasks",
        "Dataset B features accessory compatibility checks for specific models (iPhone 16 Silicone Case) versus A's general compatibility lists",
        "Dataset B tasks involve warranty status checks and AppleCare purchases while A focuses on warranty management systems",
        "Dataset B requires price comparison across product categories (AirPods vs Watches) unlike A's single-category model comparisons",
        "Dataset B includes corporate/financial inquiries (quarterly results, business success stories) absent from A's consumer focus",
        "Dataset B tasks demand integration research (iPhone-Watch connectivity) while A focuses on standalone device specifications"
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Dataset B tasks require accessing or modifying user settings/help (e.g., language translation settings, Google Search settings), while Dataset A focuses purely on data retrieval without configuration steps.",
        "Dataset B includes tasks explicitly related to health/medical information (e.g., flu symptoms, diabetes), whereas Dataset A does not involve health-related queries.",
        "Dataset B tasks involve job search and professional services (e.g., software engineering jobs, Google Ads specialist meetings), absent in Dataset A\u2019s task scope.",
        "Dataset B requires locating tutorials/guides (e.g., woodworking tutorials, YouTube tutorials), while Dataset A tasks do not involve instructional content retrieval.",
        "Dataset B tasks demand interaction with educational or self-improvement content (e.g., learning Spanish, climate change research), which Dataset A lacks.",
        "Dataset B includes event planning or local service discovery (e.g., casual venues, taekwondo practice locations), unlike Dataset A\u2019s focus on global/static data.",
        "Dataset B tasks target academic research papers (e.g., quantum computing, machine learning), whereas Dataset A prioritizes general factual extraction (e.g., movie release dates).",
        "Dataset B tasks involve product purchasing details (e.g., Apple Watch price, Pixel 9 Pro specs), while Dataset A focuses on non-commercial factual data (e.g., player statistics).",
        "Dataset B emphasizes user-specific localization (e.g., \"near me,\" zip-code-based real estate data), whereas Dataset A\u2019s location parameters are broader (e.g., country-level).",
        "Dataset B includes tasks requiring content editing or contribution (e.g., Wikipedia edits), absent in Dataset A\u2019s passive data retrieval tasks."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Dataset B tasks frequently involve user-generated content interaction (e.g., recipe ratings, woodworking tutorials) while A focuses on established platforms",
        "Dataset B requires more personal data input/management (e.g., dietary restrictions, shopping lists) compared to A's passive retrieval",
        "Dataset B contains transactional tasks (e.g., ticket purchases, job applications) absent in A's information-only focus",
        "Dataset B emphasizes educational/self-improvement content (language learning, parenting advice) unlike A's factual reporting",
        "Dataset B includes future-oriented queries (2025 strategies, upcoming movies) while A focuses on current/past data",
        "Dataset B tasks require credibility assessment of sources (author verification) not emphasized in A",
        "Dataset B features health management scenarios (blood pressure recipes, symptom research) absent in A",
        "Dataset B contains event planning components (venue research, amenities comparison) not present in A",
        "Dataset B includes more interactive content curation (recipe databases, project ideas) versus A's static data retrieval",
        "Dataset B tasks involve practical daily living activities (cooking, DIY projects) unlike A's academic/statistical focus"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Dataset B tasks emphasize exploratory learning or conceptual understanding (e.g., 'What is machine learning?') while A focuses on concrete data retrieval",
        "Dataset B includes tasks requiring procedural knowledge (e.g., 'how to enable language translation') where A focuses on factual recall",
        "Dataset B contains tasks about future projections (e.g., 'movie trailers 2025') while A focuses on current/past verifiable facts",
        "Dataset B features tasks involving knowledge synthesis (e.g., 'effects of climate change') whereas A requires direct information extraction",
        "Dataset B includes career/service-oriented tasks (e.g., job applications, venue bookings) while A focuses on information lookup",
        "Dataset B contains health/medical guidance tasks (e.g., flu prevention) not present in A",
        "Dataset B features academic/research objectives (e.g., paper analysis) more prominently than A's general domain queries",
        "Dataset B includes creative/interpretive tasks (e.g., recipe creation) where A focuses on predefined metrics",
        "Dataset B contains tasks requiring comprehension of complex systems (e.g., stock comparisons) vs A's singular value comparisons",
        "Dataset B features open-ended exploration tasks (e.g., 'find corporate event ideas') while A specifies exact numerical targets"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "Dataset B tasks frequently involve transactional actions (e.g., booking hotels, purchasing products, reserving venues) whereas Dataset A focuses solely on information retrieval.",
        "Dataset B includes tasks requiring interaction with user-specific inputs (e.g., date ranges, location filters, budget constraints) for personalized results, while Dataset A prioritizes universally applicable factual queries.",
        "Dataset B tasks often involve accessing and synthesizing instructional or procedural content (e.g., tutorials, guides, tips) rather than purely factual data.",
        "Dataset B tasks require navigating commercial platforms (e.g., hotel booking sites, product stores) more frequently than Dataset A, which focuses on search engines and informational websites.",
        "Dataset B includes explicit requests for actionable steps (e.g., \"book it,\" \"configure,\" \"buy\"), whereas Dataset A tasks emphasize data extraction without direct user action beyond searching.",
        "Dataset B tasks often involve health-related queries (e.g., symptoms, eligibility criteria, treatment options) absent in Dataset A, which centers on entertainment, sports, and technical specifications.",
        "Dataset B tasks require parsing user reviews, ratings, or availability statuses (e.g., hotel stars, venue availability) as critical criteria, while Dataset A focuses on objective rankings or numerical metrics.",
        "Dataset B includes tasks targeting professional or business needs (e.g., event planning, job searches, marketing tools) not present in Dataset A's consumer-oriented queries.",
        "Dataset B tasks frequently involve multi-source comparisons (e.g., product features, pricing across vendors) beyond simple side-by-side evaluations common in Dataset A.",
        "Dataset B tasks include future-oriented planning (e.g., \"must-have tools for beginners,\" \"2025 travel destinations\"), whereas Dataset A focuses on historical or real-time data."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Tasks in dataset B require interaction with website-specific features (e.g., forms, modals, settings) for task completion, while dataset A tasks focus on direct data extraction without interaction.",
        "Dataset B includes tasks involving purchasing or transactional actions (e.g., booking hotels, buying tickets), whereas dataset A tasks are strictly informational with no transactional intent.",
        "Dataset B tasks frequently involve procedural instructions (e.g., software installation, configuration steps), while dataset A tasks emphasize retrieving pre-existing numerical or categorical data.",
        "Dataset B tasks often explore educational or professional program details (e.g., university degrees, certification requirements), whereas dataset A tasks prioritize real-time metrics or rankings.",
        "Tasks in dataset B include subjective or opinion-based queries (e.g., 'best recipes,' 'event ideas'), while dataset A tasks target objective, verifiable facts.",
        "Dataset B contains health/medical information retrieval tasks (e.g., symptom causes, treatment guidelines), which are absent in dataset A.",
        "Dataset B tasks require navigation to specialized institutional websites (e.g., CDC, academic journals), while dataset A tasks primarily use general search engines or mainstream platforms.",
        "Dataset B includes tasks related to event planning and promotion strategies, which are not present in dataset A.",
        "Dataset B tasks involve job search or career-related information (e.g., job openings, skill requirements), while dataset A does not.",
        "Dataset B tasks require managing or modifying user-specific data (e.g., search history, account settings), whereas dataset A tasks are purely informational without personalization."
      ]
    }
  },
  "diffs_real_from_synth": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Dataset B tasks specifically request information about parking facilities (e.g., motorcycle parking, EV charging) while Dataset A focuses on general business/service locations",
        "Dataset B includes explicit requirements to identify charging infrastructure (e.g., Tesla Destination Chargers) not present in Dataset A tasks",
        "Dataset B tasks frequently reference zip code-based location searches (e.g., 90028, 80202) while Dataset A uses city/landmark-based references",
        "Dataset B contains tasks requiring analysis of review content specifics (e.g., 'what a one-star review says') rather than just rating comparisons in Dataset A",
        "Dataset B includes explicit printing/sharing functionality requirements (e.g., 'print map as PDF', 'sharing link') absent from Dataset A",
        "Dataset B tasks specify exact numeric rating thresholds (e.g., 'greater than 4.8') while Dataset A uses relative terms like 'highly-rated'",
        "Dataset B contains intersection-based location queries (e.g., 'main street and Amherst street') not seen in Dataset A",
        "Dataset B includes operational hour constraints with exclusion criteria (e.g., 'not open 24 hours') beyond Dataset A's temporal filters",
        "Dataset B tasks require identification of specific infrastructure levels/features (e.g., 'which level has least proportion in reviews') not found in Dataset A",
        "Dataset B references national/state-level landmarks (e.g., 'Castle Mountains National Monument') while Dataset A focuses on urban/city landmarks"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Dataset B tasks emphasize multi-leg trip planning (e.g., 'Plan a trip from X to Y to Z') rather than simple A\u2192B route requests",
        "Dataset B requires analysis of review metadata patterns (e.g., 'which level has least proportion in reviews') beyond basic review retrieval",
        "Dataset B includes post-search actions like map exporting/sharing (e.g., 'print map as PDF', 'generated sharing link')",
        "Dataset B queries target technical infrastructure specifications (e.g., 'Tesla Destination Charger', 'EV charging supported parking')",
        "Dataset B contains explicit requests for charging network types rather than generic EV mentions",
        "Dataset B tasks require comparative analysis of facility levels/zones within single locations (e.g., airport terminal analysis)",
        "Dataset B includes precise temporal constraints for parking (e.g., 'closes at night') beyond general hours",
        "Dataset B tasks demand geographic hierarchy awareness (e.g., 'in the state of California called...') for disambiguation",
        "Dataset B contains procedural multi-step operations (e.g., 'first search X, then find way to share')",
        "Dataset B explicitly requests critique analysis of negative reviews (e.g., 'check what one-star review says')"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Dataset A tasks frequently involve booking/reservation requirements (e.g., hotel availability dates, restaurant reservations) while Dataset B does not",
        "Dataset B includes technical infrastructure queries (e.g., EV charging stations, Tesla Destination Chargers) absent in Dataset A",
        "Dataset B requires map interface manipulation tasks (e.g., printing as PDF, sharing links) not present in Dataset A",
        "Dataset A emphasizes personal preference refinement (e.g., cuisine types, budget tiers) while Dataset B focuses on technical specifications",
        "Dataset B contains tasks requiring information synthesis from map visualizations (e.g., airport level analysis) unlike Dataset A",
        "Dataset B includes specialized service provider searches (e.g., plumbers, locksmiths with specific hours) not found in Dataset A",
        "Dataset A features complex multi-destination routing while Dataset B focuses on point-to-point navigation",
        "Dataset B requires analysis of review distribution patterns (e.g., least reviewed airport levels) absent in Dataset A",
        "Dataset B specifies vehicle-specific parking needs (motorcycle/bicycle) while Dataset A uses generic parking references",
        "Dataset B includes map data export/sharing tasks (PDF generation, link sharing) not required in Dataset A"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Dataset B tasks focus on identifying specific infrastructure points (e.g., parking facilities, EV chargers) rather than accommodations/services",
        "Dataset B requires analyzing hierarchical location data (e.g., airport levels) not present in Dataset A",
        "Dataset B tasks involve zip code-based proximity searches while Dataset A uses landmark-based references",
        "Dataset B includes technical transportation specifications (motorcycle parking, EV charging support) absent in A",
        "Dataset B tasks demand direct review content analysis (e.g., reading specific low-rated reviews)",
        "Dataset B requires map export/sharing functionality (PDF generation, link sharing) not mentioned in A",
        "Dataset B emphasizes parking logistics (closure times, parking types) rather than accommodation logistics",
        "Dataset B tasks combine negative filters (\"available now but not open 24 hours\") unlike A's positive filters",
        "Dataset B includes national monument/nature reserve information retrieval not present in A",
        "Dataset B tasks focus on commercial chains (Apple Stores, Tesla Chargers) rather than generic service categories"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Tasks in dataset B explicitly request quantitative outputs (e.g., '5 beauty salons', 'least proportion in reviews') while A focuses on qualitative filtering",
        "Dataset B requires direct interaction with map technical features (e.g., 'print map as PDF', 'share map link') unlike A",
        "B contains tasks demanding analysis of review sentiment components (e.g., 'what a one-star review says') whereas A only uses ratings as thresholds",
        "Dataset B specifies infrastructure-focused searches (e.g., 'motorcycle parking', 'EV charging parking') while A emphasizes service establishments",
        "Tasks in B require identifying hierarchical location aspects (e.g., 'which level has...') not present in A's queries",
        "B includes explicit geographic container requirements (e.g., 'in the state of California', 'within 2 miles of zip code') where A uses relative proximity",
        "Dataset B tasks demand binary availability confirmation (e.g., 'available now but not open 24 hours') while A uses temporal ranges",
        "B contains precise operational hour matching (e.g., 'closes at night') whereas A uses simpler 'open now' status checks",
        "Tasks in B require direct facility-type verification (e.g., 'Tesla Destination Charger') while A focuses on general accessibility features",
        "Dataset B includes explicit instruction sequencing (e.g., 'first search X then find Y') as task requirements unlike A's concurrent filtering"
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks in dataset B require specifying exact numerical thresholds (e.g., stars, forks) for repository searches, while A uses broader popularity criteria.",
        "Dataset B tasks frequently involve time-bound constraints (e.g., repositories updated within last 10 days), whereas A focuses on static timeframes like update dates without recency requirements.",
        "Dataset B includes tasks requiring summarization of repository objectives or project purposes, which are absent in A.",
        "Dataset B tasks demand identification of top contributors or recent commit file changes, while A focuses on general commit history extraction.",
        "Dataset B explicitly requires validation of element existence (e.g., Readme files) during repository searches, unlike A.",
        "Tasks in B involve ranking developers/repositories (e.g., 'currently ranked first this month'), whereas A lacks ranking-based requirements.",
        "Dataset B tasks require listing exact feature counts (e.g., '3 features') from product pages, while A focuses on general feature exploration.",
        "Dataset B includes granular programming language + domain combinations (e.g., 'C# game development'), whereas A uses broader language filters.",
        "Tasks in B specify concrete output formats (e.g., 'name of changed files, total additions'), while A focuses on information extraction without formatting constraints.",
        "Dataset B requires identification of 'most popular' items through quantitative comparisons (e.g., highest stars/forks), while A uses qualitative popularity indicators like 'trending'."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Dataset B tasks emphasize repository discovery using specific criteria (e.g., stars, update dates, topics) while Dataset A focuses on retrieving predefined information (e.g., pricing, API docs).",
        "Dataset B requires summarizing features/objectives (e.g., 'list 3 features') whereas Dataset A tasks demand precise extraction of technical/legal details (e.g., data usage policies).",
        "Dataset B includes real-time validation (e.g., checking email existence during sign-up) absent in Dataset A.",
        "Dataset B tasks frequently involve trending/ranked content (e.g., 'trending Python repositories') unlike Dataset A.",
        "Dataset B specifies exact numerical thresholds (e.g., '200 stars', '50 stars') while Dataset A uses broader criteria (e.g., 'high-severity').",
        "Dataset B tasks prioritize recency (e.g., 'last 10 days', 'last week') for repositories, unlike Dataset A\u2019s time-agnostic queries.",
        "Dataset B includes user interaction simulations (e.g., sign-up flow steps) whereas Dataset A focuses on passive navigation.",
        "Dataset B tasks target repository metadata analysis (e.g., contributors, commit changes) while Dataset A emphasizes service comparisons (e.g., Copilot vs. Enterprise).",
        "Dataset B requires identifying 'most popular' or 'highest' metrics (e.g., stars, forks) absent in Dataset A\u2019s tasks.",
        "Dataset B tasks involve parsing dynamic content (e.g., Readme files, course actions) while Dataset A focuses on static documentation (e.g., security advisories)."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Dataset B tasks require specific temporal constraints (e.g., 'last 30 days', 'last 10 days') for repository searches, while Dataset A tasks use more general time references",
        "Dataset B tasks explicitly request quantitative outputs (e.g., 'list 3 features', '50 or more stars'), while Dataset A focuses on qualitative understanding",
        "Dataset B includes verification tasks (e.g., 'see if email exists') during sign-up processes, which are absent in Dataset A",
        "Dataset B tasks require identification of specific file types (e.g., Readme) in repositories, unlike Dataset A's general repository searches",
        "Dataset B tasks demand trending/real-time data analysis (e.g., 'currently ranked first this month'), while Dataset A focuses on static information retrieval",
        "Dataset B includes explicit version control tasks (e.g., 'files changed in last commit'), whereas Dataset A focuses on general repository navigation",
        "Dataset B tasks require ranking/sorting by popularity metrics (e.g., 'most stars', 'highest number of forks'), while Dataset A compares plan features without ranking",
        "Dataset B tasks specify programming language constraints in searches (e.g., 'C++', 'Python'), while Dataset A's language filters are more general",
        "Dataset B includes mobile-specific feature inquiries (e.g., 'Copilot chat on mobile'), which are absent in Dataset A tasks",
        "Dataset B tasks require identification of contributor information (e.g., 'top three contributors'), while Dataset A focuses on general organizational structure"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Tasks in B require specifying exact time frames (e.g., 'last 30 days', 'last 10 days') for repository searches/filtering, while A uses broader date ranges",
        "B tasks explicitly require identifying numerical thresholds for stars/forks (e.g., '200 stars', '500 stars') in repository criteria, unlike A",
        "B focuses on retrieving specific repository contributor details and commit histories, which are not present in A's tasks",
        "Tasks in B demand identification of trending/ranking metrics ('currently ranked first', 'most popular') not required in A",
        "B requires verification of account existence/status through direct interaction (e.g., email check), while A focuses on account creation/upgrades",
        "Tasks in B explicitly request extraction of quantitative metrics from repositories (e.g., 'total additions/deletions', 'number of forks') not seen in A",
        "B tasks require identifying specific course content details in GitHub Skills (e.g., 'actions learners will perform'), while A focuses on general resource discovery",
        "B tasks involve precise identification of project purposes/features from repository descriptions, whereas A focuses on general capability exploration",
        "Tasks in B require cross-referencing multiple dynamic filters simultaneously (language + stars + timeframe + topic), while A uses simpler filtering criteria",
        "B tasks explicitly require working with real-time/recent data ('last week', 'last 2 days'), whereas A tasks use static temporal references"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Dataset B tasks require locating repositories with explicit criteria on creation/update dates (e.g., 'last 30 days', 'last 10 days'), whereas Dataset A focuses on general temporal filtering (e.g., 'last updated') without strict recency constraints.",
        "Dataset B emphasizes repository discovery with granular parameters (e.g., 'at least 200 stars', 'Readme file', 'specific programming language'), while Dataset A uses broader repository search criteria (e.g., 'language', 'stars').",
        "Tasks in Dataset B frequently require summarizing repository objectives or main features (e.g., 'summarize its main objective'), whereas Dataset A focuses on extracting factual product information (e.g., 'pricing details').",
        "Dataset B includes explicit identification of top contributors or commit details (e.g., 'list the top three contributors', 'files changed in the last commit'), which are absent in Dataset A tasks.",
        "Dataset B tasks involve trending-repository and popularity rankings (e.g., 'trending Python repositories with most stars'), while Dataset A does not reference trending metrics.",
        "Dataset B requires locating educational resources (e.g., 'Resolve merge conflicts course in GitHub Skills'), whereas Dataset A focuses on technical documentation for APIs/project management.",
        "Dataset B tasks demand validation of account-specific conditions (e.g., 'check if email exists during sign-up'), while Dataset A handles general account management (e.g., 'sign-up', 'email verification').",
        "Dataset B includes precise cost calculations (e.g., 'yearly cost in USD for Copilot Individual'), whereas Dataset A asks for plan comparisons without monetary computations.",
        "Dataset B requires identifying 'highest' or 'most' metrics (e.g., 'project with the most stars', 'highest number of forks'), which are absent in Dataset A's qualitative tasks.",
        "Dataset B tasks involve extracting structured lists (e.g., 'list 3 features', '2 customer stories'), while Dataset A focuses on understanding feature descriptions or limitations."
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Tasks in B involve accessing college sports recruiting information (e.g., NCAAW recruiting top players' colleges).",
        "B requires retrieving granular player physical attributes (e.g., heaviest weight among infielders).",
        "B includes tasks focused on NFL team depth charts and positional injury statuses (e.g., 2nd position injuries).",
        "B necessitates interaction with predictive analytics like the Basketball Power Index (BPI) rankings.",
        "Tasks in B demand summarization of editorial content (e.g., main points of soccer headlines).",
        "B requires knowledge of league/division structures (e.g., NFC North team composition).",
        "B involves retrieving future-oriented projections (e.g., NFL MVP candidates for an upcoming season).",
        "B includes detailed college basketball conference matchups (e.g., SEC Network games with team rankings).",
        "B tasks cover international soccer competitions (e.g., Spanish Supercopa, Carabao Cup).",
        "B requires navigation of tennis match schedules with player nationalities (e.g., ATP/WTA tournaments)."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Tasks in B frequently require identifying specific player achievements (e.g., most assists in a game) with positional context",
        "B emphasizes retrieving career-long statistics (e.g., total games played) rather than just current season performance",
        "Navigation in B often requires accessing team-specific depth charts and precise injury reports",
        "B tasks frequently involve comparing team rankings within power indexes (e.g., Basketball Power Index)",
        "B requires identification of geographic naming patterns in team names across leagues",
        "Tasks in B specifically target ESPN+ content and subscription service features",
        "B emphasizes real-time tracking of recent trades/acquisitions within 48-hour windows",
        "Navigation in B includes NCAAW recruiting data analysis not present in A",
        "B tasks require weight-based comparisons of athletes within specific positional groups",
        "B involves cross-referencing game statistics with weather/market conditions (e.g., loser high vs winner high)"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Dataset B tasks require retrieving information about ESPN+ features and exclusive content navigation, while A focuses on ESPN+ streaming schedules",
        "Dataset B includes tasks involving player physical attributes (e.g., weight) not present in A's requirements",
        "Dataset B contains requests for historical career statistics analysis (e.g. total career games played) absent in A",
        "Dataset B requires identification of top performers through analytical comparisons (e.g. 'heaviest infielder'), unlike A's direct stat retrieval",
        "Dataset B tasks demand article content summarization from news sections, which A doesn't require",
        "Dataset B includes team composition verification tasks (e.g. division members) not found in A",
        "Dataset B contains queries about game highlight summaries rather than just score reporting in A",
        "Dataset B requires navigation through opinion/analysis content (MVP candidate articles) absent in A",
        "Dataset B tasks involve roster position-specific injury tracking not present in A's general injury reports",
        "Dataset B includes conditional game analysis (e.g. loser high vs winner high) requiring deeper data synthesis than A"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Tasks in B require identifying specific player statistics (e.g., assists, rebounds) within recent games, whereas A focuses on general career/seasonal stats retrieval.",
        "B includes tasks involving direct comparisons of player physical attributes (e.g., 'heaviest infielder'), which are absent in A's tasks.",
        "B tasks frequently require analyzing team compositions (e.g., NFC North teams, roster depth charts) rather than general standings navigation in A.",
        "B explicitly involves interpreting analytical metrics like the Basketball Power Index, while A focuses on basic standings/rankings access.",
        "B tasks demand real-time identification of yesterday's/last 48-hour game results, whereas A includes broader historical data (e.g., '2002-03 NBA stats').",
        "B requires locating injury reports within specific team depth charts, while A tasks involve general injury report searches without positional context.",
        "B tasks involve verifying statistical anomalies (e.g., 'loser high > winner high') in matchups, which A does not require.",
        "B includes ESPN+ tool functionality exploration (e.g., 'summary of ESPN+ Tools'), whereas A focuses on accessing ESPN+ content streams/shows.",
        "B tasks require identifying geographic naming patterns in team names (e.g., 'teams with Los Angeles'), absent in A's scope.",
        "B emphasizes current MVP candidate analysis in specific leagues, while A tasks involve general news article searches without candidate comparisons."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Tasks in B require analytical comparisons (e.g., identifying statistical leaders, contrasting team performance) beyond basic data retrieval.",
        "B includes queries demanding cross-sport context (e.g., verifying Los Angeles-based teams across leagues).",
        "B tasks often involve conditional logic (e.g., filtering games where losers outperformed winners in specific metrics).",
        "B requires summarizing article content (e.g., extracting main points from news headlines) rather than just locating articles.",
        "B tasks explicitly reference ESPN+ Tools functionality (e.g., exploring platform-specific features beyond content access).",
        "B contains queries about physical attributes (e.g., heaviest infielder weights) requiring deeper roster analysis.",
        "B tasks demand positional specificity (e.g., 2nd-string injuries in depth charts) rather than general injury reports.",
        "B requires temporal precision (e.g., 'within last 2 days') more frequently than A's general recency requirements.",
        "B includes multi-criteria player evaluations (e.g., combining assists totals with team/position context).",
        "B tasks involve league structure verification (e.g., NFC North team composition) rather than standalone standings checks."
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Dataset B tasks require identifying models based on release recency (e.g. 'released in past month') while Dataset A focuses on general popularity metrics",
        "Dataset B tasks explicitly involve interacting with educational resources like classroom benefits and tutorial parameters (e.g. '8bit/4bit loading') not mentioned in Dataset A",
        "Dataset B requires analyzing community engagement metrics (upvotes, GitHub stars) as primary indicators of quality, whereas Dataset A focuses on download counts/update dates",
        "Dataset B contains tasks specifically about pricing plans and account tiers ('Pro account features') not present in Dataset A",
        "Dataset B requires temporal filtering for model updates ('last updated in 2022/March 2023') as core task requirement unlike Dataset A's general update timelines",
        "Dataset B tasks involve direct interaction with demo Spaces through conversational queries ('ask it which team trained you') rather than just finding Spaces",
        "Dataset B emphasizes technical configuration parameters (e.g. 'temperature parameter default value') more frequently than Dataset A",
        "Dataset B requires summarization of toolkit strengths/documentation features as explicit task goals, while Dataset A focuses on finding installation/usage instructions",
        "Dataset B tasks specifically target newest/latest resources ('latest machine learning model') as primary search criteria unlike Dataset A's general search requirements",
        "Dataset B includes tasks requiring analysis of dataset content through Dataset Viewer ('first message content') rather than just metadata extraction"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Dataset B tasks emphasize finding models released within specific recent timeframes (e.g., 'past month') while Dataset A focuses on general recency without strict temporal bounds",
        "Dataset B requires interaction with live demo Spaces (e.g., chat interfaces) while Dataset A focuses on static model/dataset exploration",
        "Dataset B tasks involve tracking community engagement metrics (upvotes, GitHub stars) while Dataset A tracks technical metrics (download counts, performance benchmarks)",
        "Dataset B requires following step-by-step tutorials from documentation (e.g., PEFT adapter loading) while Dataset A focuses on general API usage patterns",
        "Dataset B includes content generation tasks using inference APIs (e.g., story generation) while Dataset A focuses on information retrieval",
        "Dataset B tasks require identifying newest/latest models in specific domains (e.g., 'latest fake news detection') while Dataset A seeks established models",
        "Dataset B emphasizes open-source status verification as primary filter while Dataset A includes broader license types",
        "Dataset B requires cross-referencing model cards with research papers (GitHub stars, paper mentions) while Dataset A focuses on standalone model metadata",
        "Dataset B tasks involve educational resource discovery (classroom benefits, tutorials) while Dataset A focuses on technical documentation",
        "Dataset B requires analysis of model interaction patterns (chat responses, UI parameters) while Dataset A focuses on static attribute verification"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Dataset B tasks require real-time interaction with model inference APIs (e.g., text generation) while A focuses on API documentation lookup",
        "Dataset B requires navigation through tutorial content (e.g., PEFT adapter loading guides) rather than general technical documentation",
        "Dataset B tasks emphasize finding models with specific release recency criteria (past month) compared to A's broader timeframe requirements",
        "Dataset B includes analysis of community-curated content (daily papers with upvotes) not present in A's tasks",
        "Dataset B tasks require identification of specific model parameters (temperature settings) rather than just architecture details",
        "Dataset B contains explicit queries about subscription plan comparisons (Pro account features) while A focuses on enterprise tiers",
        "Dataset B requires summarizing toolkit strengths from documentation rather than just locating API references",
        "Dataset B tasks involve direct interaction with hosted demo applications (chat interfaces) beyond just cross-referencing",
        "Dataset B emphasizes NLP models with specific language pair requirements (en-ja) rather than general translation capabilities",
        "Dataset B tasks require identification of model applications/usecases from descriptions rather than just functional categories"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Dataset B tasks require real-time interaction with live demo Spaces (e.g. chat interfaces)",
        "Dataset B emphasizes finding resources with explicit versioning requirements (e.g. 'March 2023 update')",
        "Dataset B tasks focus more on API parameter extraction (e.g. temperature settings)",
        "Dataset B includes requests for GitHub star counts of documentation resources",
        "Dataset B requires direct use of Inference API for content generation tasks",
        "Dataset B tasks specify commercial plan comparisons (Pro account features/pricing)",
        "Dataset B emphasizes newer release timelines ('past month' vs general popularity)",
        "Dataset B includes explicit requests for model application scenarios (travel chat generation)",
        "Dataset B tasks require interaction with dataset viewer content inspection",
        "Dataset B focuses on educational resource identification (classroom benefits, tutorials)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Dataset B tasks focus on discovering newly released models within specific timeframes (e.g., past month) while Dataset A emphasizes finding established models",
        "Dataset B requires direct interaction with live demo Spaces (e.g., chat interfaces) whereas Dataset A focuses on static resource exploration",
        "Dataset B tasks involve extracting technical implementation details for quantization methods (4bit/8bit loading) not mentioned in Dataset A",
        "Dataset B emphasizes real-time popularity metrics tracking (daily trending papers) compared to Dataset A's historical popularity analysis",
        "Dataset B includes explicit API usage tasks (text generation via Inference API) not present in Dataset A's documentation-focused activities",
        "Dataset B requires identification of models with specific technical constraints (2022 update recency, 1M+ downloads thresholds) while Dataset A uses broader temporal filters",
        "Dataset B tasks demand parameter-level configuration analysis (temperature settings) whereas Dataset A focuses on model-level metadata",
        "Dataset B contains explicit comparisons of commercial plans (Pro account features/pricing) while Dataset A references enterprise tiers more generally",
        "Dataset B requires evaluation of toolkit capabilities summaries (Text Embeddings Inference strengths) versus Dataset A's installation/usage tasks",
        "Dataset B emphasizes multimodal output generation (3D models, chat responses) while Dataset A focuses on traditional NLP tasks"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Dataset B tasks explicitly require courses with specific rating thresholds (e.g., 4.7+), while A focuses on general comparisons without quantified rating filters.",
        "Dataset B queries emphasize granular time commitments (e.g., hours per week) rather than broader duration ranges common in A.",
        "Tasks in B frequently target module-level details (e.g., video counts, lesson names), whereas A focuses on course/specialization metadata.",
        "Dataset B includes regional partnership inquiries (e.g., Australian institutions), while A lacks geographic specificity in collaboration queries.",
        "B tasks explicitly request identification of free courses, which is not a prominent filter in A's sampled tasks.",
        "Dataset B contains queries about business/team-specific platform features (Coursera for Teams/Business), absent in A's career-focused tasks.",
        "Tasks in B specify required course components (e.g., Agile methodology modules), whereas A focuses on general topic alignment.",
        "Dataset B requires instructor biography analysis and cross-course teaching history, not emphasized in A's tasks.",
        "B tasks utilize credit eligibility and duration filters (1-4 years), while A focuses on academic pathway alignment without temporal constraints.",
        "Dataset B queries emphasize Specialization structure validation (e.g., mandatory courses within sequences), whereas A focuses on Specialization learning outcomes."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Dataset B requires identification of courses with exact numerical rating thresholds (e.g., 4.7 stars) while A uses broader rating ranges (4+ stars)",
        "Tasks in B specifically request time estimates in hours rather than duration ranges (weeks/months) used in A",
        "B includes navigation tasks requiring identification of partner institutions from specific countries (e.g., Australia)",
        "B requires extraction of module-level granular details (e.g., video counts per module) not present in A's tasks",
        "Navigation patterns in B involve comparing service offerings (Coursera for Business vs Teams) while A focuses on course comparisons",
        "B tasks require identification of future-oriented course components (e.g., Renewable Energy Futures) not emphasized in A",
        "B contains tasks requiring precise counts of filtered course results rather than qualitative comparisons in A",
        "B includes requirements to identify instructor biographies and related course offerings beyond basic metadata",
        "Tasks in B incorporate credit eligibility as a filter parameter not present in A's requirements",
        "B emphasizes identification of AI-enhanced professional certificates while A focuses on foundational technical certificates"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Dataset B includes tasks requiring identification of courses with specific partner institutions from a particular country (e.g., Australian universities/companies), while Dataset A does not.",
        "Dataset B requires extraction of precise review percentages (e.g., rounded 5-star rating distribution), whereas Dataset A only involves rating thresholds.",
        "Dataset B includes tasks comparing business-oriented plans (Coursera for Business vs. Teams), while Dataset A focuses on individual learner program comparisons.",
        "Dataset B requires filtering by credit eligibility status for courses, which is absent in Dataset A tasks.",
        "Dataset B contains tasks demanding granular module-level metadata extraction (e.g., video counts per module), while Dataset A only requires general content verification.",
        "Dataset B involves duration filtering using hour-based thresholds (e.g., <20 hours), whereas Dataset A uses week-based duration parameters.",
        "Dataset B includes identification of renewable energy futures as a specific subtopic requirement, unlike Dataset A's general technical skill matching.",
        "Dataset B requires instructor biography analysis and cross-referencing of their multiple course offerings, while Dataset A only involves basic instructor-course mapping.",
        "Dataset B contains queries about 3D printing courses from renowned universities, a domain not present in Dataset A's technical skill requirements.",
        "Dataset B includes intermediate-level blockchain technology courses with institutional reputation verification, while Dataset A focuses on foundational technical skills like Python/Agile."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Dataset B tasks require filtering by exact rating thresholds (e.g., 4.7 stars) while A focuses on general rating ranges",
        "B tasks involve extracting precise time estimates (hours per week) whereas A uses broader duration ranges (weeks/months)",
        "B requires identifying geographic-specific partner institutions (e.g., Australian partners) while A only mentions general geographic cross-referencing",
        "Tasks in B demand comparison of Coursera's business service plans (For Business vs Teams) not present in A",
        "B contains queries requiring percentage calculations from reviews (e.g., 5-star rating percentages) that A doesn't specify",
        "Dataset B tasks require counting total matching courses after multiple filters while A focuses on individual course identification",
        "B includes requests for module-level content specifics (number/names of videos) beyond A's general course structure understanding",
        "Tasks in B specify exact completion time formats (e.g., '1-3 months with 5hrs/week') versus A's general duration ranges",
        "B requires listing all partners from specific countries while A only involves general partner-institution cross-referencing",
        "Dataset B tasks demand identification of beginner-level Specializations with specific included courses, a granularity not seen in A"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Dataset B tasks require filtering by exact numeric ratings (e.g., 4.7) while A uses broader rating thresholds (e.g., 4+ stars)",
        "Dataset B tasks demand precise completion time metrics in hours where A focuses on duration ranges in weeks/months",
        "Dataset B includes queries for regional institution partnerships (e.g., Australian universities) not emphasized in A",
        "Dataset B requires granular module-level metadata (e.g., video counts per module) absent in A's task requirements",
        "Dataset B tasks analyze review distributions (e.g., percentage of 5-star ratings) while A only verifies minimum ratings",
        "Dataset B emphasizes specific curricular components (e.g., Agile methodology modules) rather than general program structures",
        "Dataset B contains language learning specialization queries (e.g., Spanish) not present in A's career-focused tasks",
        "Dataset B includes niche technical domains (e.g., 3D printing, renewable energy) beyond A's core business/tech focus",
        "Dataset B requires instructor biography analysis and cross-course correlations not specified in A's metadata requirements",
        "Dataset B tasks demand quantitative filter result counts (e.g., total matching courses) while A focuses on qualitative comparisons"
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Dataset B tasks require interaction with arXiv's non-research features (store, merchandise)",
        "Dataset B contains tasks requiring quantitative analysis of results (count papers/merchandise types)",
        "Dataset B includes tasks requiring summarization of paper objectives/hypotheses",
        "Dataset B tasks involve procedural navigation (article withdrawal processes, QR code sharing)",
        "Dataset B requires understanding of arXiv's operational structure (leadership team, help guides)",
        "Dataset B contains tasks requiring random selection from result sets",
        "Dataset B uses specific date range constraints beyond standard 'last week' filters",
        "Dataset B tasks involve interaction with external notification systems (email/Slack subscriptions)",
        "Dataset B includes multi-step analysis tasks (search \u2192 select \u2192 summarize)",
        "Dataset B requires identification of paper version history tracking (v3 submission dates)"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Tasks in dataset B require explicit quantitative analysis (e.g., counting results or articles)",
        "Dataset B includes tasks involving arXiv's non-academic features (e.g., store, merchandise, leadership team)",
        "Tasks in dataset B require interaction with administrative or operational guidelines (e.g., article withdrawal)",
        "Dataset B tasks frequently involve precise date-range filtering beyond general 'recent' queries",
        "Tasks in dataset B demand summarization of specific paper sections (e.g., objectives, conclusions)",
        "Dataset B includes multi-step actions (e.g., search \u2192 count \u2192 select \u2192 summarize)",
        "Tasks in dataset B require retrieval of version-specific metadata (e.g., submission dates for v3)",
        "Dataset B tasks involve non-traditional navigation (e.g., QR code sharing, cart interactions)",
        "Tasks in dataset B include random selection from filtered results as part of the workflow",
        "Dataset B requires combining metadata fields (e.g., author first names + submission dates) in queries"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Dataset B tasks require quantitative analysis of results (e.g. counting papers, tracking submission frequency) while A focuses on information location",
        "Dataset B includes interactions with non-research website components (e.g. merchandise store) absent in A's academic-focused tasks",
        "Dataset B requires handling of precise temporal constraints (e.g. 'within last 2 days') more frequently than A's broader time windows",
        "Dataset B tasks involve explicit result set manipulation (e.g. random selection from results) not seen in A",
        "Dataset B contains requests for operational metadata (e.g. version submission dates) beyond A's content-focused metadata needs",
        "Dataset B requires navigation through administrative interfaces (e.g. article withdrawal help) unlike A's pure research navigation",
        "Dataset B includes complex combinatorial queries (e.g. author+timespan+content) exceeding A's typical search criteria",
        "Dataset B tasks demand content summarization/interpretation where A focuses on direct extraction",
        "Dataset B involves interface exploration beyond core research features (e.g. sharing functionalities) absent in A",
        "Dataset B requires cross-referencing between temporal and categorical dimensions more rigorously than A"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Dataset B tasks require quantitative analysis of search results (e.g., counting papers, tracking submission frequencies)",
        "Dataset B includes merchandise-related navigation tasks absent in Dataset A",
        "Dataset B tasks demand leadership/organizational information retrieval (e.g., team member names)",
        "Dataset B contains specific version history tracking requirements (e.g., v3 submission dates)",
        "Dataset B requires cross-checking between article content and submission metadata",
        "Dataset B tasks involve random selection from result sets (e.g., choose paper at random)",
        "Dataset B includes summarization requirements for paper objectives/hypotheses",
        "Dataset B contains precise date range constraints spanning multiple years",
        "Dataset B requires first-name author searches combined with technical queries",
        "Dataset B tasks involve QR code interactions and e-commerce functionalities"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Dataset B requires quantitative analysis of results (e.g., counting papers, aggregating statistics) while Dataset A focuses on information retrieval",
        "Tasks in Dataset B specifically require handling date range parameters beyond standard 'recent' filters (e.g., custom date ranges)",
        "Dataset B includes interaction with non-research content (e.g., arXiv store merchandise, organizational leadership information)",
        "Dataset B tasks require content summarization from papers rather than just extraction",
        "Dataset B contains tasks requiring comparison between multiple papers/categories",
        "Dataset B involves navigation through administrative/support features (e.g., article withdrawal process)",
        "Tasks in Dataset B require interpretation of submission version histories beyond basic version awareness",
        "Dataset B includes e-commerce interactions (e.g., adding merchandise to cart)",
        "Dataset B requires handling combined categorical and temporal constraints simultaneously",
        "Tasks in Dataset B demand multi-step inference from paper contents (e.g., identifying research goals from introductions)"
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Dataset B tasks require summarization of article content (e.g., 'summarize key points'), while Dataset A focuses on article retrieval without explicit summarization requirements",
        "Dataset B includes tasks requiring identification of specific metadata like author names and publication dates, absent in Dataset A tasks",
        "Dataset B tasks involve analytical comparisons (e.g., 'count which country has most players'), unlike Dataset A's straightforward information retrieval",
        "Dataset B contains tasks referencing specialized sections like 'Green Living' and 'The SpeciaList' not mentioned in Dataset A's navigation requirements",
        "Dataset B requires identification of statistical data (e.g., 'highest number of runners'), while Dataset A tasks focus on categorical content discovery",
        "Dataset B tasks explicitly demand real-time information verification (e.g., 'latest headlines at this time'), whereas Dataset A uses relative timestamps like '3 hrs ago'",
        "Dataset B includes multi-step analysis tasks (e.g., 'browse then analyze'), while Dataset A tasks emphasize single-action navigation",
        "Dataset B requires content evaluation across hierarchical taxonomies (e.g., 'War related sections'), whereas Dataset A focuses on section navigation without taxonomy analysis",
        "Dataset B tasks involve structured data parsing from leaderboards/tables (e.g., golf rankings), absent in Dataset A's article-focused tasks",
        "Dataset B contains tasks requiring identification of corporate entities in headlines (e.g., 'which companies are involved'), not required in Dataset A"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Dataset B tasks require summarization of article content as a core objective, while Dataset A focuses on locating specific information without summarization",
        "Tasks in Dataset B explicitly request identification of article authors and publication dates, not mentioned in Dataset A requirements",
        "Dataset B includes tasks requiring comparative analysis (e.g. country rankings in sports), absent from Dataset A objectives",
        "Time sensitivity in Dataset B tasks specifies narrower time windows (e.g. 'within last two days') compared to Dataset A's general recency focus",
        "Dataset B contains tasks requiring quantitative analysis (e.g. counting sections/players), unlike Dataset A's qualitative information retrieval",
        "Tasks in Dataset B demand identification of specific content categories (e.g. 'Green Living', 'The SpeciaList') not referenced in Dataset A instructions",
        "Dataset B includes meta-analysis tasks (e.g. 'identify main headlines covering UK climate plans'), absent from Dataset A's direct information retrieval",
        "Tasks in Dataset B require synthesis of geographic impact data (e.g. 'areas affected' by disasters), more complex than Dataset A's geographic filtering",
        "Dataset B contains tasks requiring evaluation of corporate involvement (e.g. 'which companies are involved'), not present in Dataset A requirements",
        "Dataset B tasks involve multi-step comparisons (e.g. player nationalities and scores), while Dataset A focuses on single-information retrieval"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Dataset B tasks require synthesizing information from multiple sections (e.g., combining economic and climate data), while Dataset A focuses on single-section navigation",
        "Tasks in Dataset B explicitly demand summarization of central arguments/insights, whereas Dataset A tasks emphasize article discovery without analysis",
        "Dataset B includes localized geographical subcategories (e.g., 'Glasgow & West Scotland') not present in Dataset A's regional classifications",
        "Time constraints in Dataset B are specific (e.g., 'within last two days') versus Dataset A's general recency verification ('hrs ago')",
        "Dataset B requires identification of content authorship/publishing metadata absent in Dataset A requirements",
        "Tasks in Dataset B involve quantitative analysis (e.g., counting players, ranking scores) unlike Dataset A's qualitative information retrieval",
        "Dataset B contains specialized verticals like 'Green Living' requiring niche navigation not emphasized in Dataset A",
        "Dataset B tasks frequently require cross-referencing data points across content types (text, tables, rankings) while Dataset A focuses on linear navigation",
        "Dataset B includes structured data interpretation (e.g., leaderboards, tournament results) absent from Dataset A's article-focused tasks",
        "Dataset B tasks demand critical evaluation of content relationships (e.g., economic implications of environmental policies) versus Dataset A's fact-finding objectives"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Dataset B tasks require explicit summarization directives (e.g., 'summarize central points') while Dataset A focuses on general content retrieval without mandated synthesis",
        "Dataset B includes tasks demanding identification of authorship/publication dates (metadata elements) absent in Dataset A requirements",
        "Dataset B tasks specify granular regional filters (e.g., 'impact in Europe') whereas Dataset A uses broader regional categories",
        "Dataset B contains instructions requiring comparative analysis across content types (e.g., economic vs environmental impacts) not present in Dataset A",
        "Dataset B tasks frequently mandate time-bound verification ('within last two days') while Dataset A uses relative timeframes ('latest')",
        "Dataset B includes explicit section targeting (e.g., 'Green Living section') versus Dataset A's general categorical navigation",
        "Dataset B requires quantitative analysis of content (e.g., 'count which country has most players') unlike Dataset A's qualitative exploration",
        "Dataset B tasks demand identification of specific entities (companies, author names) while Dataset A focuses on thematic content",
        "Dataset B contains multi-step verification requirements (e.g., confirm publication date AND summarize) absent in Dataset A's single-step tasks",
        "Dataset B emphasizes critical evaluation of content sources/context whereas Dataset A prioritizes information discovery breadth"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Dataset B tasks require explicit summarization of key insights and central points from articles, while Dataset A focuses more on locating information without structured summarization.",
        "Tasks in Dataset B frequently involve identifying specific named entities (e.g., author names, event locations) within content, unlike Dataset A's general topical searches.",
        "Dataset B includes tasks requiring quantitative analysis of content (e.g., counting players in leaderboards), absent in Dataset A's qualitative navigation tasks.",
        "Time constraints in Dataset B are strictly defined (e.g., 'within the last two days'), while Dataset A uses relative timeframes like 'latest' or 'recent'.",
        "Dataset B tasks require cross-referencing between quantitative data points (e.g., country representation in sports rankings), unlike Dataset A's thematic cross-referencing.",
        "Specialized content sections like 'Green Living' and 'Women's Majors' are explicitly targeted in Dataset B, while Dataset A uses broader categories like 'Sports' or 'Business'.",
        "Dataset B tasks demand identification of content authorship/publishing metadata, which Dataset A tasks do not emphasize.",
        "Navigation in Dataset B requires interaction with hierarchical leaderboards/rankings not present in Dataset A's tasks.",
        "Dataset B includes tasks requiring comparison of regional impacts (e.g., 'areas affected by natural disasters'), while Dataset A focuses on regional content access without comparisons.",
        "Tasks in Dataset B explicitly require verification of content recency through publication dates, whereas Dataset A only implies time sensitivity through terms like 'latest'."
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Dataset B tasks require filtering by multiple specific numerical attributes simultaneously (e.g., capacity measurements, room size requirements)",
        "Dataset B emphasizes verifying exact return/delivery policies for specific item configurations during navigation",
        "Dataset B tasks frequently specify exact publication years or model years for products like books/electronics",
        "Dataset B includes tasks requiring identification of technical specifications like energy efficiency ratings or zoom capabilities",
        "Dataset B tasks demand cross-referencing protection plan durations (e.g., 2-year warranties) with specific products",
        "Dataset B requires comparing items based on combined material composition and dimensional requirements (e.g., memory foam thickness + size)",
        "Dataset B tasks often involve checking availability of specific color variants within constrained price ranges",
        "Dataset B includes explicit requirements to verify deterrent or protective mechanisms (e.g., anti-squirrel bird feeders)",
        "Dataset B tasks require parsing specialized product certifications (e.g., hypoallergenic certifications)",
        "Dataset B contains tasks that mandate saving/searching by exact review quantity thresholds (e.g., 500+ reviews)"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Tasks in dataset B require applying multi-condition filters (e.g., 'Used - Good' condition + price range)",
        "Dataset B tasks demand explicit validation of return policy details per specific product variant (color/size)",
        "Dataset B includes time-sensitive event navigation (e.g., 'New year upgrades', NFL Wild Card promotions)",
        "Tasks in B specify exact numerical thresholds for customer reviews (e.g., '500+ reviews' vs general '4+ stars')",
        "Dataset B requires comparison of technical specifications (e.g., room size compatibility, battery life hours)",
        "B contains tasks involving refurbished/used product conditions not present in A's new-item focus",
        "Dataset B tasks require explicit verification of warranty/protection plan options",
        "B includes spatial measurement requirements (e.g., '30-inch length', '5mm thickness') absent in A",
        "Tasks in B demand cross-referencing multiple attributes simultaneously (pattern + color + size + price)",
        "Dataset B requires identification of energy efficiency ratings/sustainability features not mentioned in A"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Tasks in dataset B require specifying exact numeric ranges for product attributes (e.g., $50-$100 price range, 10x zoom)",
        "Dataset B tasks frequently involve verifying product certifications/standards (e.g., hypoallergenic, energy efficiency ratings)",
        "Dataset B emphasizes checking specific warranty/protection plan details and costs",
        "Tasks in dataset B require matching multiple simultaneous physical dimensions (e.g., 30\" length, 5mm thickness)",
        "Dataset B contains explicit requirements for product condition grading (e.g., 'Used - Good' quality filters)",
        "Tasks in dataset B mandate checking availability of specific technical specifications (e.g., Windows 11 Home, 1TB disk)",
        "Dataset B requires comparing products based on exact room size/capacity requirements (e.g., 300 sq ft coverage)",
        "Tasks in dataset B involve verifying specific deterrent or protective mechanisms (e.g., anti-squirrel bird feeders)",
        "Dataset B contains explicit requirements for publication/release years of media products",
        "Tasks in dataset B require cross-referencing sale status with price thresholds (e.g., 'on sale and under $10')"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Dataset B tasks require multi-step filtering (e.g., filtering by condition, then price), while Dataset A tasks focus on single-step filters.",
        "Dataset B tasks explicitly require comparing prices across top search results, whereas Dataset A tasks mention price comparison without specifying result count.",
        "Dataset B tasks include specific material constraints (e.g., 'memory foam', 'stainless steel'), while Dataset A tasks mention materials more generically.",
        "Dataset B tasks emphasize verifying free shipping eligibility explicitly, while Dataset A tasks reference shipping options indirectly.",
        "Dataset B tasks require checking return policies for specific items post-purchase, while Dataset A tasks ask for general policy verification.",
        "Dataset B tasks involve exact numeric thresholds (e.g., '300 sq ft', '10x zoom'), whereas Dataset A uses broader ranges like 'under $50'.",
        "Dataset B tasks mandate saving preferred products from results (e.g., 'save the lowest priced'), while Dataset A focuses on adding items to cart directly.",
        "Dataset B tasks specify color/pattern availability checks (e.g., 'blue floral pattern'), while Dataset A tasks mention color filters without verification steps.",
        "Dataset B tasks include explicit energy efficiency or technical specs (e.g., 'energy efficiency rating'), absent in Dataset A's product specs.",
        "Dataset B tasks require counting available options (e.g., 'how many colors'), whereas Dataset A tasks focus on binary availability checks."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Dataset B tasks require multi-attribute filtering combinations (e.g., material + price + rating)",
        "Dataset B includes explicit numerical constraints for product dimensions (e.g., '30 inches length', '5mm thickness')",
        "Dataset B tasks specify exact product conditions (e.g., 'Used - Good') as filter criteria",
        "Dataset B requires validation of specific technical specifications (e.g., '1TB disk size', '10x zoom capability')",
        "Dataset B tasks demand direct comparison of quantitative metrics across products (e.g., price per sq ft capacity)",
        "Dataset B includes explicit verification of return policy details for specific item configurations",
        "Dataset B tasks require identification of temporal product attributes (e.g., 'published in 2024')",
        "Dataset B contains tasks with layered availability checks (color variations + shipping options + stock status)",
        "Dataset B requires preservation of filtered results through save/bookmark actions for later comparison",
        "Dataset B tasks involve validation of accessory/compatibility requirements (e.g., 'suitable for 300 sq ft rooms')"
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Tasks in Dataset B more frequently require multi-part answers combining numerical results with comparative analysis, while Dataset A tasks focus on single-step solutions or explanations.",
        "Dataset B tasks often involve precise physical measurements (e.g., projectile velocity, power plant output) whereas Dataset A includes broader conceptual queries (e.g., paradox definitions).",
        "Dataset B emphasizes retention of significant figures/scientific notation in results, while Dataset A does not specify numerical precision requirements.",
        "Dataset B tasks systematically compare multiple entities under constrained assumptions (e.g., food calorie comparisons at fixed serving sizes), unlike Dataset A's general comparisons.",
        "Dataset B contains more parametric problem-solving with exact numerical inputs (e.g., 45-degree launch angle, 18cm step height), whereas Dataset A uses more qualitative variables.",
        "Dataset B features complex geometric packing optimization problems absent from Dataset A's task samples.",
        "Dataset B requires explicit multi-variable metabolic calculations (combining height/weight/age/activity metrics) while Dataset A handles simpler health metrics like BMI.",
        "Dataset B tasks frequently demand dimensional analysis with compound units (e.g., energy output in specific years, memory per pixel density) compared to Dataset A's simpler unit conversions.",
        "Dataset B includes rotational transformations of mathematical objects (e.g., rotated ellipses) requiring coordinate system manipulation, not seen in Dataset A.",
        "Dataset B tasks specify exact temporal/spatial boundaries for calculations (e.g., 2023 movie prices by city) whereas Dataset A uses broader temporal references (e.g., 'current' financial data)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Dataset B tasks require numerical answers with strict formatting (e.g., scientific notation, significant figures), while Dataset A focuses on descriptive or multi-faceted data retrieval",
        "Dataset B emphasizes physics/engineering applications (e.g., projectile kinematics, energy output), whereas Dataset A includes broader chemistry/biology/geography queries",
        "Dataset B tasks frequently combine multiple computational objectives (e.g., mass comparison + planetary day length), while Dataset A tasks typically address single objectives",
        "Dataset A contains humanities-focused queries (e.g., word etymology, historical figures) absent in Dataset B",
        "Dataset B features complex geometric/mathematical optimization problems (e.g., circle packing, curve length) not seen in Dataset A",
        "Dataset B requires precise parameterization of physical systems (e.g., step height, age/weight metrics) compared to Dataset A's general variable inputs",
        "Dataset A includes temporal data analysis (e.g., COVID-19 trends), while Dataset B focuses on instantaneous physical/mathematical calculations",
        "Dataset B tasks involve theoretical constraint definition (e.g., inequality regions) unlike Dataset A's concrete real-world applications",
        "Dataset A contains visualization requests (e.g., derivative plots) absent in Dataset B's text/number-oriented tasks",
        "Dataset B demonstrates stricter unit conversion requirements (e.g., ppi-to-memory, chemical mass-to-moles) compared to Dataset A's basic unit queries"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Dataset A tasks frequently involve exploratory information gathering (e.g., 'learn about carbon', 'browse paradoxes') while Dataset B focuses on executing predefined computational operations",
        "Dataset B requires multi-variable constraint analysis in geometry/physics problems (e.g., packing optimization, projectile kinematics) not seen in Dataset A",
        "Dataset A contains linguistic/historical queries (e.g., word etymology, historical events) absent from Dataset B's strictly STEM-focused tasks",
        "Dataset B emphasizes precise numerical formatting requirements (e.g., scientific notation with specific sig figs) not present in Dataset A",
        "Dataset A includes data export/formatting requirements (e.g., 'save in XLS format') while Dataset B focuses solely on computational results",
        "Dataset B tasks frequently combine multiple distinct computations in single queries (e.g., planetary mass + rotational period calculations)",
        "Dataset A contains health/medical information requests (e.g., cancer research, calorie intake) not found in Dataset B's physical science focus",
        "Dataset B requires coordinate system transformations (e.g., rotated conic sections) while Dataset A focuses on direct unit conversions",
        "Dataset A includes social science elements (e.g., salary data, financial metrics) absent from Dataset B's pure mathematical/physical problems",
        "Dataset B tasks demand parametric optimization analysis (e.g., 'densest packing comparison') whereas Dataset A focuses on single-solution retrieval"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Dataset B tasks require multi-step problem-solving combining multiple scientific principles (e.g., projectile motion + energy calculations)",
        "Dataset B emphasizes geometric packing optimization and spatial configuration analysis",
        "Dataset B includes explicit requests for numerical precision formatting (e.g., significant figures in scientific notation)",
        "Dataset B contains more physics-based kinematics calculations (projectile trajectories, velocity/time relationships)",
        "Dataset B tasks frequently involve comparative analysis of mathematical models/methodologies (e.g., packing densities)",
        "Dataset B requires compound unit conversions with dimensional analysis (e.g., mass\u2192moles\u2192elemental composition)",
        "Dataset B features complex metabolic calculations integrating multiple physiological parameters",
        "Dataset B includes specific engineering system performance metrics (power plant output measurements)",
        "Dataset B tasks involve geometric transformations of equations (e.g., rotated conic sections)",
        "Dataset B requires parametric curve analysis and spatial integration (curve length calculations)"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Tasks require multi-step comparative analysis between specific entities or models (e.g., packing densities, food calorie comparisons).",
        "Tasks involve dynamic physical system calculations (e.g., projectile motion, energy output of power plants).",
        "Tasks demand combinatorial analysis with constraints (e.g., polyomino combinations, inequality constraints for geometric regions).",
        "Tasks explicitly require synthesizing results from multiple parameters (e.g., metabolic properties based on height, weight, activity).",
        "Tasks focus on generating mathematical expressions (e.g., integrals, rotated ellipse equations) rather than solely solving equations.",
        "Tasks involve aggregating or averaging data across multiple categories or locations (e.g., average movie ticket prices in cities).",
        "Tasks include explicit conditional requirements (e.g., 'how many have only 2 rows') within combinatorial problems.",
        "Tasks require applied physics or engineering computations (e.g., projectile velocity, energy production metrics).",
        "Tasks specify multi-part unit conversions with compositional breakdowns (e.g., sulfuric acid to moles with element percentages).",
        "Tasks integrate geometric or trigonometric transformations (e.g., rotating ellipses, curve length calculations)."
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Tasks in B require specifying exact numerical thresholds (e.g., 'under 25 minutes', '4.5 stars') rather than general constraints.",
        "B tasks explicitly demand cross-referencing multiple parameters simultaneously (e.g., cooking time + rating + ingredient count).",
        "B requires outputting structured data formats (e.g., ingredient lists, nutrition tables) rather than simple recipe identification.",
        "Tasks in B mandate verification of quantitative metadata (e.g., 'more than 500 reviews', 'under 600 calories').",
        "B includes requirements to list multiple results (e.g., 'list 3 recommended', '6 holiday sections') rather than single-result tasks.",
        "Tasks in B frequently require extraction of specific qualitative descriptors (e.g., 'primary seasoning used', 'type of dressing').",
        "B emphasizes statistical validation through review metrics (e.g., 'at least 50 reviews') as core task requirements.",
        "Tasks in B demand temporal precision (e.g., 'estimated cooking time', 'prep time of under 1 hour') beyond basic time ranges.",
        "B requires explicit content synthesis from multiple recipe components (e.g., ingredients + steps + nutrition + reviews).",
        "Tasks in B specify audience/scaling parameters (e.g., 'suitable for 6 people') not present in A's requirements."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Tasks in dataset B require precise nutritional breakdowns (e.g., exact carb content per serving) while dataset A prioritizes general nutritional categories (e.g., 'low-carb').",
        "Dataset B tasks involve generating shopping lists from recipe ingredients, which is absent in dataset A tasks.",
        "Tasks in dataset B specify exact review count thresholds (e.g., 'more than 500 reviews') whereas dataset A uses qualitative descriptors like 'highly-rated'.",
        "Dataset B includes meal planning functionality (e.g., 'create a meal plan for 6 people') not present in dataset A tasks.",
        "Tasks in dataset B require explicit inclusion of step-by-step cooking instructions, unlike dataset A's focus on discovery/comparison.",
        "Dataset B tasks demand serving size adjustments (e.g., 'suitable for 6 people') while dataset A focuses on single-recipe retrieval.",
        "Tasks in dataset B require calorie count constraints with exact numerical limits (e.g., 'under 600 calories'), whereas dataset A mentions calorie awareness without specific thresholds.",
        "Dataset B tasks include ingredient substitution queries (e.g., 'zucchini instead of pasta') not seen in dataset A's requirements.",
        "Tasks in dataset B specify exact preparation time ranges (e.g., 'less than 1 hour') rather than dataset A's general time constraints (e.g., 'quick').",
        "Dataset B requires identification of specific recipe components (e.g., 'primary cheese used') while dataset A focuses on broader dietary categories (e.g., 'vegetarian')."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Tasks in dataset B require creating shopping lists from recipe ingredients, which is not present in dataset A.",
        "Dataset B tasks involve specifying exact numerical constraints (e.g., 'under 600 calories', 'over 500 reviews'), whereas dataset A uses qualitative thresholds (e.g., 'highly-rated').",
        "Dataset B tasks explicitly demand nutritional fact extraction (e.g., 'total carbohydrate content'), while dataset A focuses only on general nutritional information access.",
        "Tasks in dataset B require multi-criteria prioritization (e.g., '4-star rating AND zucchini AND vegetarian'), whereas dataset A focuses on single-filter requirements.",
        "Dataset B includes review-inspection tasks (e.g., 'what the latest review says'), which require temporal understanding of user-generated content not seen in dataset A.",
        "Dataset B tasks require explicit identification of recipe sections/categories (e.g., 'list 6 holiday recipe sections'), while dataset A only involves general category navigation.",
        "Tasks in dataset B demand identification of a recipe's key ingredient (e.g., 'primary cheese used'), whereas dataset A focuses only on ingredient presence/absence.",
        "Dataset B contains complex output formatting requirements (e.g., 'include cooking steps description'), while dataset A focuses on basic information extraction.",
        "Dataset B tasks require comparative analysis across recipe metadata (e.g., '3 recommended dinner recipes'), whereas dataset A focuses on individual recipe evaluation.",
        "Tasks in dataset B involve community feature interactions (e.g., 'Allrecipes Allstars program'), which are absent from dataset A's scope."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Dataset B tasks require explicit quantitative metrics (e.g., 'under 30 minutes', '4.5 stars') while Dataset A uses qualitative constraints (e.g., 'simple', 'detailed nutrition').",
        "Tasks in B frequently demand multi-criteria filtering (e.g., 'vegetarian + under 600 calories + <1 hour prep') whereas A tasks focus on single constraints.",
        "B emphasizes nutritional analysis requirements (e.g., 'low-carb breakfast', 'calories per serving') not present in A's tasks.",
        "B tasks specify exact review thresholds (e.g., '50+ reviews') while A only references review importance generally.",
        "Dataset B includes structured output requirements (e.g., 'create shopping list', 'include cooking steps') absent in A's open-ended tasks.",
        "B tasks require comparative analysis of recipe popularity (e.g., 'more than 500 reviews') vs. A's focus on basic discovery.",
        "Dataset B explicitly demands metadata synthesis (e.g., 'primary cheese used', 'seasoning noted') beyond A's ingredient listing needs.",
        "B tasks involve cross-referencing multiple recipe attributes (ratings+time+nutrition) while A tasks typically address single attributes.",
        "Dataset B includes explicit dietary-profile targeting (e.g., 'high-protein vegetarian', 'vegan') vs. A's general dietary needs.",
        "B tasks require temporal precision (e.g., 'prep time under 25 minutes') whereas A uses relative time descriptions (e.g., 'quick')."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Tasks in B require specific numerical thresholds for ratings (e.g., 4.5+ stars) and review counts (e.g., 50+ reviews), while A uses general thresholds (e.g., 4+ stars).",
        "B tasks explicitly demand structured outputs like ingredient lists, cooking steps, or nutrition facts, whereas A focuses on basic recipe retrieval.",
        "B includes multi-step actions (e.g., 'create a shopping list' + 'include cooking time'), while A tasks are single-step (e.g., 'find a recipe').",
        "B tasks frequently specify exact calorie limits (e.g., 'under 600 calories/serving'), while A only generically mentions 'nutritional information'.",
        "B requires cross-referencing multiple filters simultaneously (e.g., dietary + time + rating constraints), while A uses individual filters separately.",
        "B emphasizes quantitative analysis of community feedback (e.g., 'over 500 reviews'), while A focuses on qualitative engagement (e.g., 'leave a review').",
        "B tasks demand precise identification of recipe components (e.g., 'primary cheese used'), whereas A tasks accept general categorical matches (e.g., 'vegetarian').",
        "B includes explicit output formatting requirements (e.g., 'list 3 recipes', 'provide brief description'), while A has open-ended task phrasing.",
        "B tasks require verification of recipe scaling (e.g., 'suitable for 6 people'), while A focuses on individual/family portion discovery.",
        "B specifies temporal constraints in exact increments (e.g., 'under 25 minutes'), while A uses broader time ranges (e.g., 'under 30 minutes')."
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Tasks in Dataset B require providing International Phonetic Alphabet (IPA) notations for pronunciations.",
        "Dataset B tasks involve identifying and listing multiple distinct meanings of a single word (e.g., 'harmony').",
        "Dataset B includes tasks requiring conversion of the website's interface language (e.g., English to Deutsch).",
        "Dataset B tasks require browsing and listing items from the Shop section, absent in Dataset A tasks.",
        "Dataset B tasks explicitly ask users to count the number of definitions/meanings provided for a word (e.g., 'unblemished').",
        "Dataset B tasks mandate interaction with the Word Scramble game in the '+Plus' section with specific gameplay instructions.",
        "Dataset B tasks require detailed exploration of grammar sub-sections (e.g., articles, modal verbs) with example-based applications.",
        "Dataset B tasks involve translating specific words into designated languages (e.g., English\u2013Chinese) beyond general translation features.",
        "Dataset B tasks consistently require inclusion of example sentences as part of answers, even for pronunciation queries.",
        "Dataset B tasks focus on granular grammatical concepts like countable/uncountable nouns, whereas Dataset A covers broader topics."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Dataset B tasks require changing website interface language (e.g., to Deutsch) as part of navigation steps",
        "Dataset B includes specific tasks involving interaction with the Cambridge Dictionary Shop section",
        "Dataset B tasks frequently require counting exact numbers of definitions/meanings for words",
        "Dataset B contains explicit requirements to provide translations in specific language directions (e.g., English-Chinese)",
        "Dataset B tasks demand structured list responses (e.g., 'list 3 items') rather than single answers",
        "Dataset B's grammar tasks focus on precise syntactic structures (e.g., passive voice formation)",
        "Dataset B includes cookie consent management tasks with 'Do Not Sell' options",
        "Dataset B tasks reference newer lexical entries (e.g., 'healthwashing') in word exploration",
        "Dataset B requires interaction with ranked 'Popular searches' content sections",
        "Dataset B tasks incorporate both interface customization and content retrieval in single workflows"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Tasks in dataset B require changing the website's interface language (e.g., to Deutsch), while dataset A does not include language conversion tasks.",
        "Dataset B tasks involve interacting with non-dictionary sections like the Shop (e.g., 'list 3 items'), absent in dataset A.",
        "Tasks in B explicitly require quantitative answers (e.g., 'give the number directly'), whereas A focuses on qualitative outputs.",
        "Dataset B includes step-by-step instructions for dynamic content (e.g., 'try the first example' in games), while A's tasks are more general.",
        "Tasks in B demand reporting exact IPA notation for pronunciation, whereas A may not emphasize strict formatting.",
        "Dataset B tasks require translating words into specific languages (e.g., Chinese) with explicit reporting, while A uses broader translation comparisons.",
        "B tasks involve site-wide language settings changes (e.g., homepage conversion), whereas A focuses on dictionary translation directions.",
        "Tasks in B specify detailed grammar structure exploration (e.g., 'modal verbs for possibility'), while A's grammar tasks are more general.",
        "Dataset B requires listing multiple items (e.g., '3 Shop items'), adding a quantitative layer absent in A.",
        "Tasks in B include combined UK/US English elements in single instructions, while A may separate them contextually."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Tasks in B require providing both UK and US pronunciations with example sentences in a single task, whereas A's tasks address these elements separately",
        "B's tasks explicitly ask for the number of word meanings, a requirement not present in A",
        "B includes tasks that involve converting the website's interface language (e.g., to Deutsch), which A's tasks do not",
        "Tasks in B necessitate detailed comparisons between grammar concepts (e.g., 'fewer' vs. 'less'), while A's grammar tasks are more general",
        "B's tasks require combining pronunciation, definition, and example sentence lookup in one step, unlike A's segmented approach",
        "B includes specific translation tasks to languages like Chinese, whereas A's translations are more varied and less targeted",
        "Tasks in B involve navigating to specific grammar sections (e.g., passive voice), beyond A's general grammar navigation",
        "B's tasks focus on explaining modal verbs for expressing possibility, a specific grammar aspect not emphasized in A",
        "B's tasks include browsing the Shop section and listing items, absent in A's tasks",
        "B's tasks require changing translation language directions, while A's translation tasks are unidirectional"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Dataset B tasks require users to report the number of distinct word meanings (e.g., 'how many meanings of unblemished') as part of objectives",
        "Dataset B includes tasks involving interactive word games (e.g., Word Scramble) accessible through the Plus section",
        "Dataset B requires language conversion of the entire interface (e.g., 'Convert homepage to Deutsch') rather than just translation functionality",
        "Dataset B contains tasks explicitly requiring exploration of commercial products (e.g., 'browse Cambridge Dictionary Shop section')",
        "Dataset B emphasizes identification of modal verb usage patterns (e.g., 'might', 'could') in grammar explanations",
        "Dataset B tasks specifically target passive voice construction analysis in grammar sections",
        "Dataset B requires direct numerical reporting of linguistic feature counts (e.g., 'give the number directly' for meanings)",
        "Dataset B includes explicit requirements to compare/contrast grammatical forms (e.g., comparative vs superlative adjective rules)",
        "Dataset B tasks involve interface language settings modification rather than just content translation",
        "Dataset B contains objectives requiring interaction with commercial/monetization features (Shop section)"
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Dataset B tasks focus on specific hardware configurations (e.g., 64GB storage, Wi-Fi + Cellular) while Dataset A emphasizes general model comparisons",
        "Dataset B requires precise price calculations for customized builds whereas Dataset A focuses on base model price comparisons",
        "Dataset B tasks involve checking local availability/in-store pickup options absent in Dataset A requirements",
        "Dataset A includes tasks requiring navigation to technical support/documentation sections not emphasized in Dataset B",
        "Dataset B contains explicit requests for physical device measurements (weight/size) not present in Dataset A samples",
        "Dataset A tasks frequently involve environmental/sustainability information not required in Dataset B objectives",
        "Dataset B requires identification of specific hardware components (e.g., 40-core GPU) while Dataset A focuses on chip types generally",
        "Dataset A includes tasks about accessory compatibility research absent in Dataset B's product-focused requirements",
        "Dataset B tasks demand exact color option comparisons across multiple generations not seen in Dataset A",
        "Dataset A contains time-sensitive information retrieval tasks (newest models) while Dataset B focuses on current product configurations"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Tasks require exact hardware specification lookup (e.g., M3 Max chip configuration, 64GB unified memory)",
        "Price calculation workflows demand explicit exclusion of accessories/engraving options",
        "Product comparisons require direct generation of tabular feature matrices between 3+ device generations",
        "Tasks involve precise OS version compatibility checks (e.g., iOS 17 with iPhone 12)",
        "Configuration tasks require mathematical calculation of upgrade pricing differentials",
        "Technical specification extraction focuses on physical dimensions/weight measurements",
        "Product research requires identification of maximum technical capabilities (e.g., video recording resolution)",
        "Tasks demand explicit confirmation of accessory exclusion in purchase workflows",
        "Comparison requirements include color option analysis across multiple product generations",
        "Feature verification tasks require listing specific quantities (e.g., 5 built-in apps)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Tasks require specifying exact storage capacities (e.g., 64GB, 1TB) without optional accessories like engraving or Apple Pencil.",
        "Tasks focus on identifying compatibility of software updates (e.g., iOS 17) with specific device models (e.g., iPhone 12).",
        "Tasks involve checking product availability for in-store pickup on specific future dates (e.g., January 10, 2024).",
        "Tasks require precise color comparisons across multiple product generations (e.g., iPhone 13 Pro, 14 Pro, 15 Pro).",
        "Tasks necessitate locating accessory availability near specific geographic locations (e.g., zip code 90038).",
        "Tasks focus on technical specifications of peripheral devices (e.g., Siri Remote features for Apple TV 4K).",
        "Tasks require explicit exclusion of add-ons (e.g., no Smart Folio, no trade-in) during configuration.",
        "Tasks involve calculating price differences between base models and fully upgraded configurations (e.g., MacBook Pro 14-inch with M3 chip).",
        "Tasks focus on identifying maximum technical capabilities (e.g., video recording resolution for iPad mini).",
        "Tasks require determining current-generation product slogans or marketing taglines (e.g., Apple Watch Series)."
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Dataset B tasks require exact technical specifications (e.g., 16-core CPU, 64GB memory) rather than general feature inquiries.",
        "Dataset B emphasizes precise price calculation for fully customized configurations (e.g., no engraving/trade-ins) rather than basic upgrade costs.",
        "Dataset B focuses on physical product attributes (weight, dimensions, color options) more than service integration aspects.",
        "Dataset B requires direct comparison of consecutive model generations (iPhone 14 Pro vs 15 Pro) rather than cross-category comparisons.",
        "Dataset B includes time-sensitive availability checks (specific pickup dates) not present in Dataset A tasks.",
        "Dataset B tasks demand identification of maximum technical capabilities (e.g., video resolution limits) rather than general feature verification.",
        "Dataset B focuses on current/latest model identification ('as of today's date') rather than legacy device support.",
        "Dataset B requires geographic-specific availability checks (zip code-based retail searches) absent in Dataset A.",
        "Dataset B tasks involve detailed accessory compatibility checks (specific iPad folio models) rather than general accessory finding.",
        "Dataset B emphasizes hardware component comparisons (chip types, GPU cores) over software/service comparisons"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Tasks in B require explicit specification of hardware configurations (e.g., '64GB unified memory, 1TB SSD') rather than general model comparisons",
        "B contains time-sensitive availability checks requiring date-specific actions (e.g., 'schedule in-store pickup for January 10, 2024')",
        "B includes precise location-based searches using zip codes for accessory availability checks",
        "Tasks in B demand explicit exclusion of optional features/add-ons (e.g., 'no engraving, no apple pencil') in price calculations",
        "B requires identification of specific peripheral device features (e.g., 'Siri Remote features') not present in A's tasks",
        "B contains explicit requests for numerical performance metrics (e.g., 'maximum video recording resolution') rather than qualitative descriptions",
        "Tasks in B require comparison across 3+ product generations (e.g., iPhone 13/14/15 Pro) instead of 2-generation comparisons",
        "B includes explicit version compatibility checks between hardware and OS versions (e.g., 'iOS 17 compatibility with iPhone 12')",
        "Tasks in B demand component-level upgrade analysis (e.g., 'calculate total price difference from base to max upgrade')",
        "B requires identification of pre-installed software status in configuration tasks (e.g., 'no Pre-Installed Software' specifications)"
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Tasks in B require immediate real-time numerical data (e.g., live player counts, current pollution levels) while A focuses on current but non-live data (e.g., prices, news).",
        "B tasks demand explicit ranking identification (e.g., 'top 3', 'top 10') whereas A emphasizes comparative analysis without strict ordinal rankings.",
        "B requires direct extraction of quantities (e.g., 'how many in Asia?') while A tasks focus on qualitative comparisons (e.g., recipe variations).",
        "B includes time-bound retroactive queries (e.g., 'club membership from 2020\u20132021') whereas A focuses on present or forward-looking information.",
        "B tasks involve manual data transfer actions (e.g., 'copy SHA \u2192 paste \u2192 verify') absent in A's procedural workflows.",
        "B requires cross-platform validation (e.g., 'IMDb AND Rotten Tomatoes ratings') while A uses single-source verification (e.g., Allrecipes.com comparisons).",
        "B emphasizes astronomical/geographic measurements (e.g., distance to Mars, mountain elevation) unlike A's product/service specifications.",
        "B tasks target version-specific technical requirements (e.g., 'latest Adobe Photoshop on Mac') while A focuses on general feature comparisons.",
        "B requires parsing condensed summaries (e.g., 'brief introduction to planets') whereas A tasks involve deeper content exploration (e.g., research papers).",
        "B prioritizes pop culture metrics (e.g., Billboard charts, movie release dates) while A emphasizes academic/professional domains (e.g., healthcare, engineering jobs)."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Dataset B tasks consistently require retrieving exact numerical values or counts (e.g. player numbers, elevation figures, ranking positions) while Dataset A focuses on broader numerical ranges or qualitative metrics",
        "Dataset B queries emphasize timestamp-sensitive verification (e.g. 'as of today's date', 'latest commit', 'current statistics') more explicitly than Dataset A",
        "Dataset A contains tasks requiring interaction with persistent user profiles/accounts (e.g. course progress, recipe ratings, translation history) unlike Dataset B",
        "Dataset B shows stronger emphasis on ordered rankings (top-3, top-10) and positional identifiers (no. 1, most recent) compared to Dataset A's more general comparisons",
        "Dataset A includes transactional objectives (booking tickets, applying for jobs, purchasing items) absent in Dataset B's purely informational tasks",
        "Dataset B tasks frequently require working with platform-specific identifiers (IMDb IDs, GitHub SHAs, Billboard charts) more than Dataset A",
        "Dataset A contains more health/medical information retrieval tasks (symptoms, treatments, risk factors) not present in Dataset B",
        "Dataset B shows higher prevalence of astronomical/geographical measurement tasks (distances, elevations) compared to Dataset A",
        "Dataset A includes content creation/curation tasks (adding recipe ingredients, rating systems) while Dataset B focuses solely on extraction",
        "Dataset B requires more explicit temporal precision (specific seasons, single-year records, exact release dates) than Dataset A's relative time references"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Dataset A tasks focus on exploratory research and understanding concepts (e.g., 'effects of climate change', 'machine learning definitions'), while Dataset B emphasizes retrieving exact real-time metrics (e.g., 'current player counts', 'today's Earth-Mars distance')",
        "Dataset A includes transactional objectives requiring form interactions (e.g., job applications, museum bookings), whereas Dataset B tasks lack transactional elements and focus purely on information retrieval",
        "Dataset B tasks explicitly demand time-sensitive qualifiers like 'current', 'latest', or 'as of today' in 100% of samples, while Dataset A uses broader temporal ranges (e.g., '2024 trends', 'recent papers')",
        "Dataset A contains tasks requiring content synthesis from multiple sources (e.g., comparing stock performances across platforms), while Dataset B focuses on single-source verification (e.g., IMDb ratings, GitHub SHA hashes)",
        "Dataset B exclusively includes tasks requiring direct numerical extraction (e.g., 'top 3 planets', '10 most played songs'), whereas Dataset A allows for qualitative answers (e.g., 'explore event venue options')",
        "Dataset A tasks frequently target academic/professional domains (e.g., research papers, university programs), while Dataset B emphasizes pop culture/sports metrics (e.g., movie ratings, athlete statistics)",
        "Dataset B contains explicit verification steps in 20% of tasks (e.g., 'paste and tell me what the SHA is'), whereas Dataset A verification is implicit in multi-step processes",
        "Dataset A includes tasks requiring knowledge contribution (e.g., Wikipedia edits, recipe database additions) absent in Dataset B",
        "Dataset B tasks specify platform-restricted answers (e.g., 'through a blog', 'on IMDb') in 30% of samples, while Dataset A allows open-source solutions",
        "Dataset A contains comparative analysis across domains (e.g., tech vs biotech stocks), while Dataset B comparisons are intra-domain (e.g., American vs British English differences)"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "Tasks in dataset B require real-time or live data retrieval (e.g., current player counts, today's planetary distances).",
        "Dataset B tasks demand exact numerical outputs (e.g., rankings like 'top 3,' player counts, pollution statistics).",
        "B tasks frequently involve platform-specific data extraction (e.g., IMDb ratings, GitHub commit SHAs).",
        "Dataset B requires cross-platform comparisons (e.g., movie ratings on IMDb vs. Rotten Tomatoes).",
        "Tasks in B emphasize generating ranked lists (e.g., 'top 10 destinations,' 'most played songs').",
        "Dataset B tasks often require aggregation or counting within results (e.g., 'how many in Asia').",
        "B tasks focus on retrieving the latest version or release-specific details (e.g., software requirements).",
        "Dataset B includes tasks requiring exact identifiers (e.g., commit SHAs, release dates).",
        "B tasks prioritize measurable metrics (e.g., elevation, hardware specs) over qualitative information.",
        "Dataset B tasks universally require answers tied to the most up-to-date information available at query time."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Dataset B tasks require real-time or live data retrieval (e.g., current player counts, today's distance measurements) whereas Dataset A focuses on up-to-date but non-live information (e.g., hotel prices, news articles).",
        "Dataset B tasks explicitly demand quantification of results (e.g., 'top 3', '10 most played songs') while Dataset A requests numerical data without ranked enumeration.",
        "Dataset B includes tasks requiring identification of positional rankings (e.g., 'top-10 trending destinations', 'no. 1 artist') unlike Dataset A's comparative analysis without ordinal specificity.",
        "Dataset B tasks target platform-specific live metrics (e.g., Steam concurrent players, GitHub commit SHA) whereas Dataset A focuses on multi-platform information aggregation.",
        "Dataset B emphasizes astronomical/geographical factual queries (e.g., planetary data, mountain elevation) absent in Dataset A's health/education-oriented tasks.",
        "Dataset B requires extraction of exact temporal data (e.g., 'release date as of today', 'most recent final date') while Dataset A uses relative timeframes (e.g., 'latest news').",
        "Dataset B tasks involve sports/movie domain-specific leaderboards (e.g., UEFA Champions outcomes, Billboard charts) more frequently than Dataset A's general comparisons.",
        "Dataset B contains explicit cross-platform data validation (e.g., compare IMDb/Rotten Tomatoes scores) whereas Dataset A focuses on single-source verification.",
        "Dataset B tasks demand list enumeration constraints (e.g., 'how many in Asia', '10 songs') unlike Dataset A's open-ended numerical requests.",
        "Dataset B includes precise bio/metric retrieval (e.g., athlete goal records, pollution statistics) while Dataset A focuses on explanatory content retrieval."
      ]
    }
  }
}