{
  "sims": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Tasks require searching for specific locations (e.g., restaurants, hotels, landmarks) with granular criteria (e.g., ratings, accessibility, hours).",
        "Navigation tasks involve filtering results by proximity to geographic markers (e.g., zip codes, intersections, landmarks).",
        "Tasks demand extracting detailed attributes like ratings, pricing, amenities, or user reviews from search results.",
        "Queries often include conditional logic (e.g., 'open now but not 24 hours,' 'not open 24 hours,' 'moderately priced').",
        "Tasks involve route planning (e.g., driving, walking, transit) between two or more points with specific constraints.",
        "Users seek real-time or dynamic information (e.g., traffic conditions, operational hours, availability).",
        "Tasks require comparing multiple results (e.g., 'list three bus stops,' 'find 5 beauty salons,' 'compare hotels').",
        "Queries focus on validating or retrieving user-generated content (e.g., reviews, ratings, review summaries).",
        "Tasks involve interactive actions like making reservations, booking accommodations, or sharing map links.",
        "Users prioritize accessibility features (e.g., wheelchair accessibility, parking types, transit options) in search criteria."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Tasks require searching for specific business types (e.g., restaurants, hotels, parking) with location constraints.",
        "Queries involve filtering results by user-defined criteria like ratings (e.g., '4.6+ stars') or operating hours ('open now').",
        "Navigation tasks frequently use landmarks (e.g., Empire State Building, Eiffel Tower) or zip codes as geographic anchors.",
        "Users seek directions/route planning between locations (e.g., cities, landmarks, or user-defined points).",
        "Tasks require parsing business details: operating hours, services offered (e.g., 'not open 24 hours'), and amenities.",
        "Requests involve analyzing user reviews for specific insights (e.g., one-star review content, review trends).",
        "Queries prioritize proximity-based searches (e.g., 'nearest to X', 'within 2 miles of Y').",
        "Tasks demand multi-step actions (e.g., find location \u2192 check reviews \u2192 get directions).",
        "Users request quantitative results (e.g., 'how many results', 'list three', '5 beauty salons').",
        "Queries include accessibility-focused needs (e.g., wheelchair-accessible routes, parking types)."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Tasks involve searching for specific locations or services (e.g., restaurants, hotels, parking) with granular filters (e.g., ratings, hours, accessibility).",
        "Users frequently request directions or routes between points, including multi-stop itineraries.",
        "Queries require filtering results by real-time or temporal constraints (e.g., 'open now,' availability dates).",
        "Tasks demand comparison or evaluation of results based on user reviews (e.g., ratings, review content).",
        "Users refine searches using proximity criteria (e.g., 'nearest to,' 'within X miles').",
        "Queries often combine multiple constraints (e.g., price range + accessibility + rating).",
        "Tasks involve validating operational details (e.g., hours, service availability, pricing).",
        "Users seek location-specific amenities (e.g., EV charging, wheelchair access, parking types).",
        "Tasks require parsing structured information (e.g., maps, transit schedules, business details).",
        "Queries frequently include multi-step actions (e.g., search \u2192 filter \u2192 verify \u2192 summarize)."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Tasks require searching for specific locations (e.g., restaurants, hotels, parking) with filters like ratings, price range, or operating hours.",
        "Tasks involve retrieving detailed information about places, such as reviews, accessibility features, or amenities.",
        "Tasks demand generating directions or routes between multiple locations (e.g., cities, landmarks, or addresses).",
        "Tasks utilize geographical constraints (e.g., zip codes, proximity to landmarks, or city boundaries).",
        "Tasks require checking real-time or dynamic information (e.g., open/closed status, traffic conditions, availability).",
        "Tasks involve comparing multiple results (e.g., identifying nearest options, filtering by user ratings).",
        "Tasks include multi-step actions (e.g., finding a location first, then extracting specific details or directions).",
        "Tasks focus on accessibility or specific amenities (e.g., wheelchair accessibility, EV charging, free Wi-Fi).",
        "Tasks leverage user-generated content (e.g., reviews, photos, or ratings) for decision-making.",
        "Tasks incorporate both local and international locations, emphasizing granular geographical navigation."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Tasks require searching for specific business types (e.g., restaurants, hotels, parking) with granular filters like ratings, hours, or amenities",
        "Navigation involves proximity-based queries (e.g., 'nearest to', 'within X miles') using geographic markers like addresses or landmarks",
        "Multi-step actions combine location discovery with information extraction (e.g., find place then read reviews)",
        "Route planning tasks specify transportation modes (walking, biking, public transit) between defined start/end points",
        "Requests emphasize real-time status checks (e.g., 'open now', current traffic/weather)",
        "Tasks require parsing and summarizing structured data from listings (price ranges, review contents, operating hours)",
        "Queries utilize hierarchical filters (e.g., 'Italian restaurants with 4.8+ rating but not 24-hour')",
        "Location specifications combine multiple geographic identifiers (zip codes, street intersections, relative landmarks)",
        "Accessibility requirements are explicitly stated in search criteria (e.g., wheelchair-accessible routes)",
        "Tasks involve comparative analysis across results (e.g., 'cheapest option', 'least busy level')"
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks involve searching for repositories with specific criteria (e.g., programming language, stars, update recency).",
        "Tasks require comparing features or limitations across GitHub plans (e.g., Free vs. Pro, Copilot pricing).",
        "Tasks focus on locating security-related information (e.g., vulnerabilities, advisories, Advanced Security features).",
        "Tasks involve navigating to product pages (e.g., Copilot, Codespaces, Projects) to identify features.",
        "Tasks require interacting with GitHub\u2019s educational or onboarding resources (e.g., GitHub Skills, courses).",
        "Tasks include identifying customer stories or case studies from the GitHub homepage or dedicated sections.",
        "Tasks demand exploration of project management tools (e.g., Issues, Projects, Actions workflows).",
        "Tasks involve account-related actions (e.g., sign-up, privacy policy, terms of service).",
        "Tasks require locating open-source contributions or trending repositories (e.g., top contributors, popular projects).",
        "Tasks focus on compliance and policy details (e.g., security certifications, data protection, licensing)."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Tasks involve searching/filtering repositories by programming language, stars, or recency",
        "Tasks require comparing pricing/features across different GitHub plans (Free, Pro, Enterprise)",
        "Tasks focus on GitHub Copilot functionality, pricing, and data usage policies",
        "Tasks involve locating security-related features (advisories, Advanced Security, Dependabot)",
        "Tasks require navigating API documentation (GraphQL, REST, GitHub Actions)",
        "Tasks involve finding customer stories/case studies across industries",
        "Tasks require interaction with search functionality for repositories/users/topics",
        "Tasks involve checking repository metadata (commits, contributors, files changed)",
        "Tasks require locating developer resources (Skills, CLI, Desktop, Mobile)",
        "Tasks involve finding official documentation for GitHub features/products"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Tasks involve searching/filtering repositories by criteria (stars, update date, language).",
        "Tasks require navigating to product features/pricing pages (Copilot, Plans, Enterprise).",
        "Tasks include comparing different service tiers/pricing structures.",
        "Tasks involve locating specific documentation (security policies, feature guides, educational resources).",
        "Tasks require interacting with authentication flows (sign-up, account creation, email verification).",
        "Tasks focus on technical asset discovery (repositories, code samples, CLI tools).",
        "Tasks involve analyzing project metadata (contributors, commit history, release versions).",
        "Tasks require navigating between multiple platform sections (Marketplace, Skills, Trends, Customer Stories).",
        "Tasks involve understanding developer tools integration (Actions, Codespaces, Mobile, CLI).",
        "Tasks require interpreting platform-specific terminology (stars, forks, workflows, Dependabot)."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Tasks involve navigating GitHub's sign-up and authentication processes (e.g., checking email existence, signing up for trials).",
        "Tasks require searching/filtering repositories by language, stars, recency, or tags (e.g., Python, quantum computing, web scraping).",
        "Tasks involve comparing GitHub plans/pricing tiers (Free vs Pro vs Enterprise) and their features.",
        "Tasks require locating/analyzing repository metadata (stars, contributors, commit history, recent changes).",
        "Tasks involve exploring GitHub Copilot features, pricing, and plan comparisons.",
        "Tasks require finding security-related information (CVEs, vulnerabilities, Dependabot, Advanced Security).",
        "Tasks involve navigating customer stories/case studies and extracting specific details.",
        "Tasks require interacting with GitHub's project management features (Issues, Projects, Milestones).",
        "Tasks involve comparing/upgrading between free and paid tiers of GitHub services.",
        "Tasks require locating documentation/policies (terms, privacy, API integration, Git best practices)."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Tasks involve searching for repositories filtered by programming language, stars, and recent update dates",
        "Tasks require navigating to and interpreting pricing plan details (e.g., Free vs Pro plans)",
        "Tasks involve exploring GitHub Copilot features and subscription options",
        "Tasks require locating customer success stories/case studies",
        "Tasks involve checking security vulnerabilities/advisories and remediation features",
        "Tasks require form interactions (email input, sign-up flows)",
        "Tasks involve navigating documentation for project management features",
        "Tasks require comparing different GitHub products/services (Actions vs Projects vs Packages)",
        "Tasks involve repository analysis including contributors and commit histories",
        "Tasks require understanding GitHub's API capabilities (REST vs GraphQL)"
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Tasks focus on retrieving real-time or recent game scores across multiple sports leagues",
        "User intents include checking team standings, rankings, and conference positions",
        "Navigation requires accessing player-specific statistics including points, assists, and career metrics",
        "Tasks involve locating injury reports and player availability status for teams",
        "Users frequently seek game schedules including dates, times, and broadcast information",
        "Navigation patterns include finding news updates about trades, signings, and roster changes",
        "Tasks require differentiation between live games, completed matches, and upcoming events",
        "Users need to compare betting odds and fantasy sports recommendations",
        "Navigation involves switching between professional leagues (NBA, NFL) and college sports (NCAAM, NCAAW)",
        "Tasks require identifying streaming/watch options through ESPN platforms (ESPN+, Gamecast)"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Tasks involve retrieving real-time sports scores and game summaries from leagues like NBA, NHL, and NCAA.",
        "Users navigate to access team-specific data (e.g., standings, player stats, injuries) across multiple sports.",
        "Interaction with dynamic content such as live game updates, playoff brackets, and tournament challenges is required.",
        "Navigation includes accessing multimedia elements (e.g., team logos, game highlights) embedded in articles or scoreboards.",
        "Tasks require hierarchical navigation through structured menus (e.g., from league overviews to specific games/teams).",
        "Search functionality is utilized to locate articles, player profiles, or specific event details.",
        "Users filter/sort data (e.g., standings by conference, player stats by performance metrics).",
        "Hyperlinks to related content (e.g., game recaps, fantasy tools, betting odds) are frequently used.",
        "Account-based features (e.g., Tournament Challenge brackets, ESPN+ subscriptions) are integrated into tasks.",
        "Consistent navigation patterns for accessing schedules, scores, and news across different sports sections."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Tasks involve retrieving real-time or final game scores across multiple sports leagues (e.g., NBA, NFL, NCAA).",
        "Navigation requires accessing team-specific pages via hierarchical menus (e.g., league \u2192 team \u2192 game details).",
        "Users frequently check player/team statistics (e.g., points scored, standings, seasonal performance).",
        "Tasks include filtering results by date (e.g., games on December 25, Week 18, specific seasons).",
        "Cross-sport navigation is common (e.g., switching between NBA, NCAA football, and soccer content).",
        "Users search for injury reports or player availability (e.g., Philadelphia 76ers' latest injuries).",
        "Tasks involve comparing scores/stats between teams (e.g., Cavaliers vs. Thunder, Lakers vs. Celtics).",
        "Navigation includes accessing playoff/bracket information (e.g., CFP schedule, March Madness brackets).",
        "Users retrieve schedules for upcoming or past games (e.g., NFL Week 17, Christmas Day NBA games).",
        "Tasks require parsing dynamic content like odds, rankings, and fantasy sports data (e.g., top odds, fantasy rankings)."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Both datasets involve tasks requiring navigation to live or final game scores across multiple sports leagues (NBA, NHL, NCAA).",
        "Tasks in both datasets require identification of team abbreviations and associated scores within game summaries.",
        "Both include navigation to player/team statistics pages (e.g., salary info, performance stats, standings).",
        "Users must locate time-sensitive information like quarter/game clock statuses (e.g., 'End of 3rd', '11:28 - 2nd').",
        "Both require interaction with tournament/bracket challenges (Men's/Women's Tournament Challenge links).",
        "Tasks involve filtering results by specific dates (e.g., 'December 25, 2023', 'Thu 2:30 AM UTC').",
        "Both require differentiation between completed games ('Final') and in-progress games with live updates.",
        "Navigation to conference-specific standings (e.g., 'Eastern Conference', 'America East Conference') is required in both.",
        "Both involve cross-referencing team records (e.g., '21-54', '63-12') during score/schedule lookups.",
        "Tasks require identification of broadcast/streaming platforms (ESPN+, TNT, MLB Net) associated with games."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Tasks require retrieving real-time or recent sports game scores and results.",
        "Navigation involves accessing team standings, rankings, and conference-specific data.",
        "Queries focus on player-specific statistics, injuries, and performance metrics.",
        "Users locate schedules for upcoming games and events across multiple sports leagues.",
        "Tasks utilize search functionality to find teams, players, or specific games.",
        "Interaction with fantasy sports sections and tournament challenge brackets is common.",
        "Retrieval of news articles and updates on trades, drafts, and league developments is required.",
        "Accessing multimedia content like live streams and highlights via ESPN+ is a key task.",
        "Information filtering by dates, leagues, or teams is necessary for task completion.",
        "Navigation targets league-specific pages (NBA, NFL, NCAA) for detailed insights."
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Tasks require navigating structured content categories (models, datasets, spaces, posts)",
        "Users must interact with search functionality for specific ML resources",
        "Tasks involve extracting metadata (update dates, download counts, model metrics)",
        "Navigation requires understanding of hierarchical organization (model/creator/version structures)",
        "Tasks demand comparison of technical specifications (model sizes, tensor types, architectures)",
        "Users must locate API documentation and implementation examples",
        "Tasks require filtering/sorting by recency, popularity, or modality (text/image/audio)",
        "Navigation involves cross-referencing between models, datasets, and related applications",
        "Tasks necessitate understanding licensing information and usage restrictions",
        "Users must identify community engagement features (discussions, collaborations, posts)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Tasks involve searching for specific models or datasets by name, functionality, or modality (e.g., text, image, audio).",
        "Users must retrieve metadata such as model size, update dates, download counts, licensing, or performance metrics.",
        "Navigation requires interacting with API documentation (e.g., Inference API, Trainer API) for usage examples or integration.",
        "Tasks focus on identifying trending, recent, or most-downloaded resources across models, datasets, and applications.",
        "Users explore pricing tiers (e.g., Pro account, enterprise solutions) and compare feature sets like GPU access or support.",
        "Navigation involves filtering or sorting models/datasets by criteria like library dependencies (e.g., PaddlePaddle) or task specialization.",
        "Tasks require locating open-source projects, including architecture details, intended use cases, and contributor information.",
        "Users must cross-reference model capabilities with benchmarks (e.g., MMMU) or research papers for performance validation.",
        "Navigation includes accessing community resources like forums, blogs, or Spaces for tutorials, demos, or collaborative workflows.",
        "Tasks involve verifying licensing restrictions, usage guidelines, or attribution requirements for commercial/non-commercial applications."
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Tasks involve searching/filtering models or datasets by attributes like task type, library, or modality (e.g., text-to-image, NLP).",
        "Users frequently retrieve model metadata such as download counts, update timestamps, creator names, and performance metrics.",
        "Navigation requires interacting with API endpoints (e.g., Inference API for text/image generation, model-specific APIs).",
        "Tasks involve accessing documentation for implementation guidance (e.g., Trainer API usage, installation instructions).",
        "Users compare popularity metrics (e.g., 'most downloaded' datasets/models, trending rankings).",
        "Navigation includes exploring enterprise features like pricing tiers, commercial licenses, or enterprise-grade security.",
        "Tasks require identifying recent or updated resources (e.g., 'last updated within March 2023', 'released in the past month').",
        "Users navigate community contributions like Spaces applications, blog posts, or open-source projects.",
        "Tasks involve cross-referencing academic resources (e.g., research papers, technical reports linked to models).",
        "Navigation includes validating licensing terms, usage restrictions, or compatibility for commercial integration."
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Tasks involve searching for models/datasets by specific technical attributes (e.g., modalities, library dependencies, or performance metrics)",
        "Navigation requires interaction with categorized repositories (Models/Datasets/Spaces) through standardized filters",
        "Users frequently retrieve metadata like model size, tensor type, update timestamps, and download statistics",
        "Tasks demand API utilization exploration (Inference API, Trainer API) with parameter configuration requirements",
        "Documentation navigation patterns exist for Hugging Face tools (Transformers, Diffusers, PEFT) across both datasets",
        "Performance comparison tasks require cross-referencing model cards, benchmark results, and usage statistics",
        "Dataset interactions involve format analysis (multilingual support, annotation types) and licensing checks",
        "Tasks require temporal filtering (new/recent models) and version control awareness (base models vs. derivatives)",
        "Enterprise feature exploration patterns exist (pricing tiers, GPU deployment, SSO integration)",
        "Cross-modal linking observed between models, datasets, Spaces demos, and research papers"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Tasks involve searching for models/datasets by attributes like popularity, recency, or modality",
        "Users need to navigate API documentation for inference/training tasks",
        "Queries require understanding model metadata (download counts, update dates, performance metrics)",
        "Tasks involve comparing/contrasting multiple models/datasets",
        "Navigation requires understanding hierarchical organization of ML resources (Models > NLP > Translation)",
        "Users must interpret technical specifications (model architectures, tensor types, library dependencies)",
        "Tasks involve cross-referencing between models, datasets, and documentation",
        "Queries require temporal awareness (latest models, recent updates, version comparisons)",
        "Tasks demand understanding of licensing/compliance information (model usage terms)",
        "Navigation patterns involve alternating between search functions and browsing curated lists"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Tasks require searching/filtering courses by specific criteria (e.g., skill level, duration, rating, language)",
        "Navigation includes finding career-oriented roles (e.g., Data Analyst, UX Designer) with associated credentials",
        "Users must identify course details like instructors, institutions, learning outcomes, and skills developed",
        "Tasks involve comparing professional certificates/specializations (e.g., Google vs. IBM programs)",
        "Focus on platform features: Coursera Plus discounts, free courses, and degree program pathways",
        "Requires interaction with partner institution listings (e.g., Stanford, University of Michigan, Google)",
        "Includes verification of course metrics: median salaries, job availability, and credential counts",
        "Tasks demand exploration of AI/ML-related content and certifications (e.g., Generative AI, Prompt Engineering)",
        "Navigation involves filtering by domain (e.g., Business, Data Science, Health) and format (Guided Projects)",
        "Users must locate time-bound promotional offers (e.g., $199/year deals) and specialized collections"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Tasks require searching/filtering courses by specific criteria (rating, duration, level)",
        "Tasks involve extracting detailed course metadata (instructors, reviews, prerequisites)",
        "Both include queries about course credentials/certifications (Professional Certificates, Specializations)",
        "Tasks focus on identifying course providers (universities/companies offering content)",
        "Both require navigation through hierarchical course structures (modules, assessments, projects)",
        "Tasks demand price/cost comparisons (free courses vs paid certificates)",
        "Both involve career-oriented queries (job-ready skills, salary data, role requirements)",
        "Tasks require comparison between different learning programs/paths",
        "Both include language/accessibility requirements (course language, experience level)",
        "Tasks involve validation of learning outcomes and credential recognition"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Both datasets require users to search/filter courses by specific criteria (skill level, duration, institution, ratings, free status)",
        "Tasks in both datasets involve identifying credentials (Professional Certificates/Specializations) with associated skills/outcomes",
        "Both require comparing course/program offerings from partner universities (Stanford, Google, IBM etc)",
        "Tasks demand extraction of detailed metadata: instructor info, course descriptions, assessment types",
        "Both involve analyzing career-related metrics (median salaries, job availability data)",
        "Navigation tasks require interacting with categorization systems (by role, industry, technical skills)",
        "Both datasets include tasks that require distinguishing between course types (Projects/Degrees/Certificates)",
        "Tasks in both require identification of AI/tech-focused content (Machine Learning, Python, Data Science)",
        "Both involve finding free educational resources alongside paid credential programs",
        "Tasks require cross-referencing university partnerships with available degree programs"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Search functionality for courses using keywords or specific criteria",
        "Filtering options by course level (beginner, intermediate, advanced)",
        "Requirement to access detailed course information (instructors, duration, modules)",
        "Identification of professional certificates and their associated skills",
        "Navigation to degree programs with admission or credit details",
        "Analysis of user reviews and ratings for quality assessment",
        "Exploration of institutional partnerships (universities/companies)",
        "Differentiation between free and paid course offerings",
        "Verification of course prerequisites or enrollment requirements",
        "Comparison of learning outcomes across similar courses/programs"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Tasks require filtering courses by difficulty level (e.g., Beginner/Intermediate)",
        "Navigation involves searching for courses by specific technical skills (e.g., Python, Machine Learning)",
        "Users must extract numerical metadata (ratings percentages, review counts, course durations)",
        "Tasks require identification of credential types (Professional Certificates/Specializations/Degrees)",
        "Queries demand comparison of course providers (universities vs. company partnerships)",
        "Tasks involve checking availability of free courses/financial aid options",
        "Navigation requires understanding of hierarchical categorization (e.g., Business > Finance > Behavioral Finance)",
        "Users must identify instructor credentials and institutional affiliations",
        "Tasks require extraction of career outcome data (salary figures, job availability metrics)",
        "Navigation patterns involve cross-referencing course prerequisites with user skill levels"
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Both datasets require navigation through hierarchical subject categories (e.g., Physics, Computer Science) with subcategory granularity",
        "Tasks in both datasets involve searching for papers using specific technical terminology or keywords",
        "Both require understanding of arXiv's paper metadata structure (title, authors, submission dates, versions)",
        "Tasks involve filtering/searching by submission date ranges or recency criteria",
        "Both datasets require interaction with category-specific abbreviations (e.g., astro-ph, quant-ph)",
        "Tasks involve locating and extracting information from paper abstracts",
        "Both require navigation between different paper formats (HTML vs. PDF)",
        "Tasks involve cross-referencing between arXiv categories and external academic platforms",
        "Both datasets require understanding of arXiv's versioning system for paper updates",
        "Tasks involve identifying and following citation/reference links within papers"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Search functionality with field selection options (Title, Author, Abstract, etc.)",
        "Advanced search filters for date ranges and categories",
        "Category-specific browsing (e.g., Physics, Computer Science, Mathematics)",
        "Subcategory links for specialized research areas (e.g., Quantum Physics, Machine Learning)",
        "Access to recent submissions via 'new' and 'recent' links in each category",
        "Structured paper metadata display (title, authors, submission dates, versions)",
        "Support for multi-format paper access (PDF, HTML, source code)",
        "Search result filters for interdisciplinary topics (e.g., AI in Quantum Computing)",
        "Author-specific search capability and publication tracking",
        "Direct links to paper sections (abstract, introduction, references)"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Both datasets require searching academic papers using specific criteria like titles, authors, or keywords.",
        "Tasks involve filtering results by categories/subcategories (e.g., Computer Science, Quantum Physics).",
        "Users must navigate date-based queries to retrieve recent or time-bound submissions (e.g., last 2 days, December 2023).",
        "Both require accessing paper metadata such as submission dates, version history, and author lists.",
        "Tasks involve locating and interpreting specialized sections of papers (abstracts, methodology, results).",
        "Users interact with advanced search features (field selection, date ranges, category-specific queries).",
        "Both datasets include actions to download/view papers in multiple formats (PDF, HTML).",
        "Tasks require handling arXiv-specific identifiers (e.g., arXiv IDs, category codes like cs.CL).",
        "Users navigate hierarchical subject structures (main categories \u2192 subfields \u2192 recent lists).",
        "Both involve cross-referencing papers with external criteria (conference acceptances, citation counts)."
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Both datasets require navigation through hierarchical subject categories with detailed subfield classifications (e.g., Physics \u2192 Solar and Stellar Astrophysics).",
        "Tasks in both datasets involve searching for papers using keyword filters like title, abstract, or author names.",
        "Users must locate and interpret submission dates, version histories, and announcement timelines for papers.",
        "Both require interaction with search result filtering by date ranges (e.g., last two days, specific months).",
        "Tasks necessitate understanding arXiv's category-specific abbreviations (e.g., quant-ph for Quantum Physics, cs.CL for Computation and Language).",
        "Both involve accessing full-text paper formats (HTML/PDF) to extract specific sections like abstracts or technical details.",
        "Users must differentiate between arXiv's main categories (e.g., Computer Science vs. Statistics) and their nested sub-archives.",
        "Tasks require parsing metadata such as author counts, affiliation mentions, or conference acceptance references in papers.",
        "Both datasets involve cross-referencing papers with external criteria (e.g., AAAI 2024 acceptance, university affiliations).",
        "Navigation tasks in both rely on arXiv's persistent structural elements: search bars, category lists, and help/documentation links."
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Tasks require searching by specific research topics or keywords (e.g., 'quantum computing', 'machine learning')",
        "Navigation involves accessing category-specific archives (e.g., Computer Science, Quantum Physics)",
        "Tasks frequently require identifying paper metadata like submission dates and versions",
        "Users need to interact with 'new'/'recent' article lists within subcategories",
        "Tasks involve cross-referencing author names with paper content",
        "Both datasets require handling advanced search parameters like date ranges and field filters",
        "Tasks demand retrieval of technical details (abstracts, figures, tables) from papers",
        "Navigation includes accessing multiple paper formats (PDF, HTML, TeX source)",
        "Tasks involve category hierarchy understanding (e.g., Physics subfields, CS subdisciplines)",
        "Both require checking cross-category search differences (specific archive vs all archives)"
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Tasks require navigating hierarchical content categories (e.g., News > Technology > AI)",
        "Users must identify time-sensitive information through publication timestamps",
        "Article summaries require extraction of geographic/regional context from content",
        "Multi-step navigation needed to locate specialized sections (e.g., BBC Verify/InDepth)",
        "Content retrieval requires understanding of topical tagging systems (e.g., war-related classification)",
        "Tasks involve cross-referencing visual elements (images) with textual content",
        "Users must parse compound article metadata (category + timestamp + location)",
        "Navigation patterns require understanding of persistent section architecture",
        "Tasks demand distinction between main stories and secondary features/analysis",
        "Content discovery relies on consistent temporal ordering of articles"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Tasks require navigating to specific content categories (e.g., 'War', 'Technology', 'Sports') via structured menu hierarchies.",
        "Users must locate and extract time-sensitive information (e.g., article timestamps like '2 hrs ago' or event schedules).",
        "Tasks involve identifying multimedia elements (e.g., images, videos) within articles or sections.",
        "Navigation includes accessing region-specific news sections (e.g., 'Asia', 'Middle East', 'Europe').",
        "Users must parse article metadata (e.g., publication dates, categories, authors) to answer queries.",
        "Tasks require summarizing key points from articles with multi-paragraph content and embedded subheadings.",
        "Users interact with dynamic content like live updates (e.g., 'LIVE' coverage banners, real-time sports scores).",
        "Navigation paths involve drilling into nested lists (e.g., 'MOST READ', 'MOST WATCHED', 'OTHER TOP STORIES').",
        "Tasks demand filtering content by thematic tags (e.g., 'Business', 'Climate', 'Culture') for targeted information retrieval.",
        "Users must cross-reference hierarchical sections (e.g., 'Sports > Football > Premier League') to locate granular details."
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Structured navigation through categorized sections (e.g., News, Sport, Business, Technology)",
        "Requirement to locate and summarize time-sensitive articles with visible publication timestamps",
        "Presence of multimedia elements (images, videos) within article previews for content identification",
        "Dedicated regional news subsections (Middle East, Europe, US & Canada) for geographic filtering",
        "Real-time/live content updates marked with temporal indicators (e.g., '38 mins ago', 'LIVE')",
        "Hierarchical organization of content through nested lists and submenus for topic drilling",
        "Prominent 'Most Read/Watched' sections highlighting popular content",
        "Consistent article metadata patterns (category tags, timestamps, author references)",
        "Multi-step navigation required for specialized content (e.g., climate guides, war updates)",
        "Persistent search functionality implied through task requirements to find specific topics"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Tasks require navigating through categorised sections (e.g., World, US & Canada, Business)",
        "Users must locate time-sensitive information via publication timestamps (e.g., 'hrs ago' labels)",
        "Multi-step navigation required to find articles with specific geographic/category tags (e.g., Asia, Technology)",
        "Need to parse hierarchical menu structures to access subsections (e.g., Sports > Football)",
        "Tasks involve identifying and extracting key details from article previews (headlines, summaries)",
        "Requires differentiation between live updates and archived content through UI cues",
        "Necessitates cross-referencing content types (text articles vs embedded videos/images)",
        "Users must locate author/source attribution patterns in article metadata",
        "Tasks depend on understanding BBC's regional categorization structure (e.g., Middle East vs Asia sections)",
        "Requires identification of recurring content formats (e.g., 'In Pictures', 'Most Watched' blocks)"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Tasks require navigating hierarchical content categories (e.g., Sport, Business, Technology) to locate specific articles",
        "Users must identify and extract timestamps (e.g., 'hrs ago') from news articles",
        "Instructions involve locating multimedia elements (images/videos) within article content",
        "Tasks require distinguishing between regional news sections (Middle East, Asia, Europe)",
        "Users must parse article metadata including section labels (e.g., 'BBC InDepth') and geographic tags",
        "Navigation involves identifying and using 'Most Read/Most Watched' recommendation sections",
        "Tasks require differentiating between live updates and archived news content",
        "Users must locate specialized content categories (e.g., 'Innovation', 'Future Planet') for specific queries",
        "Instructions involve cross-referencing multiple articles within thematic clusters (e.g., war coverage)",
        "Tasks require identifying and following internal content links between related stories"
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Both datasets involve product searches requiring specific attributes like price, size, color, and ratings.",
        "Tasks in both datasets require filtering results by price ranges (e.g., under $50, between $100 and $200).",
        "Users in both datasets prioritize items with high customer ratings (e.g., 4+ stars) or best-selling status.",
        "Navigation tasks frequently involve adding items to cart or preparing orders for purchase.",
        "Both datasets include tasks to compare prices or features across multiple products.",
        "Tasks require navigating category-specific sections (e.g., electronics, home decor, fashion, baby products).",
        "Users in both datasets utilize search functionality to locate niche or specific products (e.g., \"USB-C hub with HDMI\" or \"pre-owned Louis Vuitton handbag\").",
        "Tasks often involve validating product specifications (e.g., compatibility with devices, material, or dimensions).",
        "Both datasets include tasks to identify deals, discounts, or sale items within constrained budgets.",
        "Tasks require verifying logistical details like delivery policies, return eligibility, or shipping options."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Tasks involve product search with specific attributes (price, size, color, ratings)",
        "Requires interaction with e-commerce filtering/sorting mechanisms",
        "Contains actions to add items to cart or check availability",
        "Includes price comparison across multiple products",
        "Focuses on category-specific navigation (electronics, home & kitchen, fashion)",
        "Requires understanding of product specifications and variants",
        "Contains tasks requiring account interactions (cart management, sign-in)",
        "Involves evaluation of customer reviews/ratings in decision making",
        "Includes time-sensitive operations (new releases, sales items)",
        "Requires navigation through hierarchical product taxonomies"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Tasks require precise product search with multiple filters (price, rating, size, color).",
        "Navigation involves structured categories (e.g., electronics, fashion, home essentials).",
        "Users must validate product details (reviews, availability, shipping policies).",
        "Tasks involve adding items to cart or saving results for later action.",
        "Focus on price comparison and identifying discounts/sale items.",
        "Requires interaction with search result sorting (e.g., 'Best Sellers', 'Newest Arrivals').",
        "Tasks demand navigation of Amazon-specific features (Prime benefits, Kindle Store, Fresh Grocery).",
        "Users must parse product specifications (material, dimensions, compatibility).",
        "Error recovery scenarios (e.g., CAPTCHA verification, item unavailability) are implicit in workflows.",
        "Time-sensitive actions (e.g., upcoming releases, limited-time deals) are common."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Tasks involve filtering products by price range and specific attributes (e.g., size, color, material).",
        "Users frequently search for items with minimum star ratings (e.g., 4+ stars).",
        "Navigation includes adding items to cart as a common action.",
        "Tasks require verifying product availability (e.g., stock status, color/size options).",
        "Users compare prices across multiple search results or product variants.",
        "Tasks involve checking return/delivery policies for specific items.",
        "Navigation includes sorting results (e.g., by price, customer reviews, or release date).",
        "Users target discounted/sale items with explicit budget constraints.",
        "Tasks focus on category-specific browsing (e.g., electronics, home goods, fashion).",
        "Navigation requires identifying brand-specific products (e.g., Samsung, Apple, Louis Vuitton)."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Tasks involve structured product searches with multiple filters (price, ratings, features)",
        "Require interaction with cart functionality (e.g., 'add to cart')",
        "Include price constraints as primary search criteria",
        "Focus on category-specific navigation (e.g., electronics, fashion, home goods)",
        "Demand sorting/filtering by star ratings (typically 4+ stars)",
        "Involve verification of shipping/return policies",
        "Require checking product availability (colors, sizes, stock status)",
        "Include brand-specific queries (e.g., Samsung, Wahl, Belkin)",
        "Contain time-sensitive requirements (sales, new releases)",
        "Emphasize user review analysis as decision factor"
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Tasks require computational problem-solving across mathematics, physics, and engineering domains.",
        "Queries involve unit conversions (e.g., weight, time, currency, energy).",
        "Tasks demand structured scientific data retrieval (e.g., material properties, chemical equations).",
        "Requests for step-by-step solutions to equations or proofs.",
        "Natural language inputs are used to describe complex technical or mathematical scenarios.",
        "Focus on real-world applications (e.g., health metrics, financial calculations, climate data).",
        "Comparative analysis of entities (e.g., food items, countries, materials).",
        "Inquiries about statistical distributions, series convergence, or probability metrics.",
        "Exploration of dynamic systems (e.g., pendulums, differential equations).",
        "Tasks leverage domain-specific terminology (e.g., Lagrangian, molar mass, convergence tests)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Tasks require mathematical computations and problem-solving (e.g., equations, derivatives, integrals).",
        "Queries involve scientific concepts across domains like physics, chemistry, and engineering.",
        "Navigation includes unit conversions (e.g., mass to moles, currency, distance).",
        "Tasks demand data analysis (e.g., statistical measures, trends, comparisons).",
        "Users seek step-by-step solutions for algebraic or calculus problems.",
        "Queries target real-world applications (e.g., calorie calculation, mortgage payments).",
        "Tasks explore Wolfram Alpha's computational capabilities (e.g., NLP input, visualizations).",
        "Navigation focuses on structured academic topics (e.g., Riemann Hypothesis, beta distribution).",
        "Users request comparisons (e.g., food calories, language characteristics, GDP data).",
        "Tasks involve domain-specific terminology (e.g., stoichiometry, Buckingham pi theorem, orbital periods)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Tasks involve mathematical computations (e.g., solving equations, derivatives, integrals, polynomial simplification).",
        "Tasks require scientific or engineering domain knowledge (e.g., physics, chemistry, thermodynamics, material properties).",
        "Tasks involve data analysis or interpretation (e.g., averages, statistical distributions, temperature anomalies).",
        "Tasks include unit conversions or unit-aware calculations (e.g., metric/imperial, time, energy, currency).",
        "Tasks relate to nutritional, health, or fitness calculations (e.g., calorie intake, exercise metrics, health data).",
        "Tasks demand step-by-step problem decomposition (e.g., partial fraction decomposition, limit evaluation, equation solving).",
        "Tasks explore mathematical concepts like sequences, series, and paradoxes (e.g., Fibonacci, Collatz conjecture, Russell's paradox).",
        "Tasks require real-world application of abstract concepts (e.g., financial planning, climate models, engineering scenarios).",
        "Tasks involve natural sciences queries (e.g., chemical structures, astronomical events, material conductivity).",
        "Tasks include financial/economic calculations (e.g., inflation rates, bond values, salary comparisons)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Tasks require computational problem-solving across mathematics, physics, and engineering domains.",
        "Queries involve unit conversions (e.g., currency, time, weight, energy).",
        "Navigation includes accessing specialized tools for step-by-step solutions or equation solving.",
        "Tasks demand data retrieval from structured knowledge bases (e.g., chemical properties, historical events).",
        "Interactions involve natural language input for complex calculations (e.g., derivatives, integrals).",
        "Users seek real-world applications (e.g., finance, health metrics, climate models).",
        "Tasks require comparisons between entities (e.g., food items, financial investments).",
        "Navigation targets domain-specific examples (e.g., algebra, quantum physics, economics).",
        "Queries utilize Wolfram Alpha's curated datasets (e.g., stock prices, astronomical data).",
        "Tasks involve parameterized inputs with multiple variables (e.g., mass, velocity, time constraints)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Tasks require input of mathematical equations or scientific formulas for computation.",
        "Queries involve unit conversions or physical measurements (e.g., weight, time, energy).",
        "Tasks demand computational answers for real-world scenarios (e.g., finance, health, engineering).",
        "User intents include solving differential equations or analyzing mathematical series convergence.",
        "Queries involve chemical, physical, or material properties (e.g., elements, compounds).",
        "Tasks require data retrieval from curated knowledge bases (e.g., GDP, historical climate data).",
        "User goals focus on step-by-step solutions or visualizations (e.g., plots, graphs).",
        "Queries include comparisons between entities (e.g., food items, planetary metrics).",
        "Tasks involve statistical calculations (e.g., averages, variances, distributions).",
        "User intents explore computational features of Wolfram Alpha (e.g., syntax, formatting outputs)."
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Search functionality allows filtering by recipe name, ingredients, or keywords",
        "Recipes include user-generated ratings and review counts as key metrics",
        "Advanced filtering options for dietary preferences (vegetarian, vegan, gluten-free)",
        "Time-based filtering available for prep/cook time durations",
        "Recipe cards display prominent save/bookmark functionality",
        "Multi-category navigation structure (Dinners/Meals/Ingredients/Occasions)",
        "Popular recipes sections highlight high-review-count (>1000) dishes",
        "Featured content sections for seasonal/trending recipe collections",
        "Recipe detail pages include ingredient lists and step-by-step instructions",
        "User account features for saving preferences and recipe collections"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Tasks involve searching for recipes using specific criteria like ingredients, ratings, and preparation time",
        "Users need to filter results by dietary preferences (e.g., vegetarian, vegan, keto)",
        "Tasks require identifying recipes with minimum user ratings (e.g., 4 stars or higher)",
        "Users must locate recipes with a minimum number of reviews (e.g., 100+ reviews)",
        "Navigation includes filtering by meal type (e.g., dinner, dessert, holiday-specific recipes)",
        "Tasks involve accessing recipe details like ingredients, steps, and cooking time",
        "Users are prompted to compare multiple recipes or categories (e.g., BBQ ribs, casseroles)",
        "Tasks require saving/bookmarking recipes for later reference",
        "Users need to parse nutritional information or calorie counts for meal planning",
        "Navigation includes browsing sections like 'Popular Recipes,' 'Trending Now,' or holiday-themed content"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Tasks require filtering recipes by user ratings (e.g., 4+ stars).",
        "Tasks involve searching for recipes with specific ingredient constraints (e.g., chicken, quinoa).",
        "Tasks require parsing recipe metadata like prep/cook time and calorie counts.",
        "Tasks demand identifying recipes with minimum review thresholds (e.g., 500+ reviews).",
        "Tasks involve navigating category-based sections (e.g., Dinners, Desserts, Holidays).",
        "Tasks require extracting user-generated content like reviews or ratings.",
        "Tasks involve recipe comparison (e.g., nutritional data, ingredient lists).",
        "Tasks target dietary-specific recipes (e.g., vegetarian, vegan, gluten-free).",
        "Tasks require saving/bookmarking recipes for later access.",
        "Tasks involve browsing curated recipe collections (e.g., 'Trending Now', seasonal themes)."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Both datasets require users to search for recipes using specific filters such as ingredients, dietary preferences, and cooking time.",
        "Users in both datasets navigate through categorized sections like 'Dinners', 'Meals', and 'Ingredients' to locate recipes.",
        "Tasks involve filtering recipes by user ratings (e.g., 4 stars or higher) and review counts across both datasets.",
        "Both include recipe pages with detailed metadata: ingredients, preparation steps, cook time, and nutritional information.",
        "Users interact with community features, such as saving recipes, leaving reviews, and viewing home cook profiles.",
        "Navigation tasks require browsing seasonal/holiday-specific categories (e.g., Easter, Christmas) in both datasets.",
        "Both datasets emphasize trending or popular recipes, often highlighted in sections like 'Trending Now' or 'Popular Recipes'.",
        "Users utilize search bars with autocomplete suggestions (e.g., 'Popular Searches' like 'Chicken' or 'Banana Bread').",
        "Both include editorial content such as 'Kitchen Tips', 'News', and featured articles from culinary experts.",
        "Tasks involve accessing user-generated content like reviews, ratings, and recipe modifications in both datasets."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Tasks require searching for recipes using specific filters like ratings, ingredients, or preparation time",
        "Users must locate recipes categorized by dietary preferences (vegetarian, vegan, gluten-free)",
        "Navigation involves accessing hierarchical recipe categories (e.g., dinners/meals/ingredients)",
        "Tasks require interaction with user reviews/ratings systems for recipe evaluation",
        "Users must parse detailed recipe metadata including serving sizes and nutritional requirements",
        "Tasks involve finding seasonal/holiday-specific content (Easter, Christmas, Thanksgiving)",
        "Navigation requires using search functionality with multi-word queries and ingredient combinations",
        "Users must distinguish between editorial content and community-generated recipes",
        "Tasks involve comparing recipe versions through sorting/filtering by popularity or cook time",
        "Platform features recipe saving/bookmarking functionality for later access"
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Both datasets require users to search for word definitions using a centralized search bar with language selection options.",
        "Tasks in both datasets involve accessing pronunciation guides available in both UK and US English formats.",
        "Users in both datasets are required to navigate through multiple dictionary sections (e.g., Learner\u2019s Dictionary, Essential British/American English).",
        "Both datasets include tasks that demand interaction with example sentences and grammatical usage notes for queried words.",
        "Translation functionalities (e.g., English\u2013French, English\u2013Spanish) are integral to tasks in both datasets.",
        "Grammar exploration tasks (e.g., modal verbs, comparative adjectives) are structured similarly across both datasets.",
        "Both datasets feature interactive elements like the Word Scramble game in the '+Plus' section for user engagement.",
        "Users must parse multi-part entries (e.g., word senses, phrasal verbs) to identify specific meanings in both datasets.",
        "Tasks in both datasets require browsing alphabetical index links (A-Z) to explore dictionary contents.",
        "Both datasets involve cookie consent management as part of initial page interactions."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Both datasets require users to look up word definitions using a search function.",
        "Tasks in both datasets involve retrieving pronunciation guides (UK/US variants).",
        "Users must locate example sentences demonstrating word usage in context in both datasets.",
        "Both include translation tasks between English and other languages (e.g., Chinese, French, Spanish).",
        "Grammar-related navigation is present in both (e.g., modal verbs, adjective rules, verb tenses).",
        "Tasks require identification of multiple meanings/definitions for polysemous words in both datasets.",
        "Both involve finding synonyms/thesaurus entries for given terms.",
        "Navigation to specialized dictionary sections (Learner's Dictionary, Essential variants) is required in both.",
        "Tasks in both datasets demand interaction with phonetic transcription displays.",
        "Both require users to distinguish between lexical categories (noun/verb meanings) in definitions."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Tasks require searching for words to retrieve definitions, pronunciations, and example sentences",
        "Both datasets involve accessing UK and US English pronunciation guides for words",
        "Users must navigate to grammar sections to find rules and usage examples (e.g., modal verbs, comparative adjectives)",
        "Translation functionality between English and multiple languages (e.g., Chinese, French) is utilized",
        "Tasks require identifying multiple meanings/definitions listed for single entries",
        "Example sentences are consistently available to demonstrate word usage in context",
        "Thesaurus integration enables synonym lookup tasks",
        "Word of the Day feature with pronunciation and usage examples is present in both",
        "Alphabetical browsing (A-Z index) is required for some exploration tasks",
        "Plus section with word games (e.g., Word Scramble) appears in navigation flows"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Both datasets require users to look up word definitions with detailed explanations.",
        "Tasks in both datasets involve retrieving pronunciation guides for words, including UK and US variants.",
        "Example sentences are a key component in tasks across both datasets to illustrate word usage.",
        "Both datasets include translation tasks between English and other languages (e.g., Chinese, French, Spanish).",
        "Grammar-related navigation tasks (e.g., modal verbs, comparative adjectives) are present in both datasets.",
        "Synonyms and related terms are frequently requested in tasks from both datasets (e.g., via Thesaurus).",
        "Navigation to specialized sections (e.g., Shop, Word Scramble game) is required in tasks from both datasets.",
        "Tasks in both datasets require browsing alphabetical listings (A\u2013Z) or category-based navigation.",
        "Both datasets involve social media sharing features (e.g., sharing definitions on Twitter or Facebook).",
        "Multi-step exploration (e.g., identifying multiple meanings or translations for a single word) is common to both datasets."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Tasks require dictionary lookups for word definitions, pronunciations, and example sentences in both datasets.",
        "Navigation tasks involve accessing UK and US English pronunciation guides for words in both datasets.",
        "Users must interact with grammar sections to find rules and usage examples in both datasets.",
        "Tasks include translating words/phrases between languages using dictionary translation tools in both datasets.",
        "Both datasets require users to compare multiple meanings or translations of words.",
        "Tasks involve exploring synonyms, antonyms, or related terms using thesaurus features in both datasets.",
        "Navigation includes accessing vocabulary lists, collocations, or usage notes for words in both datasets.",
        "Users must locate and interact with example sentences to understand word context in both datasets.",
        "Tasks require browsing alphabetical indexes or category-based word lists (e.g., A-Z) in both datasets.",
        "Both datasets include tasks that combine dictionary lookup with cross-referencing grammar/usage guides."
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Tasks require locating technical specifications (dimensions, weight, storage) for Apple products",
        "Users need to compare pricing between different models/variants of devices",
        "Navigation involves checking product release dates and regional availability",
        "Tasks require identifying compatibility information between devices/accessories",
        "Users must locate accessory details (types, compatibility, pricing)",
        "Tasks involve finding support documentation for account/password issues",
        "Navigation requires comparing features across product generations/models",
        "Users need to identify environmental/sustainability initiatives in product lines",
        "Tasks require exploring business/enterprise solutions offered by Apple",
        "Navigation involves checking privacy/security features and data usage policies"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Tasks require locating detailed product specifications (storage, dimensions, technical features)",
        "Navigation involves comparing multiple device models or product versions",
        "Requires price checking across different configurations or models",
        "Involves finding release dates and regional availability information",
        "Tasks require accessing support documentation for troubleshooting",
        "Navigation paths include store locator features and inventory checks",
        "Requires understanding product customization options and accessories",
        "Tasks involve subscription/service details (AppleCare+, Fitness+, etc.)",
        "Requires cross-referencing multiple website sections (product pages vs support)",
        "Navigation patterns include trade-in value calculations and financing options"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Tasks focus on product specification retrieval (e.g., weight, storage, technical features)",
        "Navigation requires price comparison between models or configurations",
        "Tasks involve identifying release dates and regional availability information",
        "Users must locate accessory compatibility/availability (e.g., cases, keyboards, Apple Pencil)",
        "Requires comparison of technical features between product generations/models",
        "Tasks demand configuration customization analysis (storage, connectivity, engraving options)",
        "Navigation involves checking in-store pickup availability near specific locations",
        "Requires understanding of educational/business-specific pricing and discounts",
        "Tasks involve cross-referencing product features with environmental/sustainability reports",
        "Navigation patterns require accessing support resources (password recovery, setup guides)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Tasks require navigating product detail pages for specifications like storage, weight, and technical features",
        "Both involve comparing prices/configurations across multiple device models (iPhone, iPad, MacBook)",
        "Tasks require accessing Apple's online store inventory check functionality for pickup availability",
        "Both datasets contain queries about product release dates and regional availability information",
        "Tasks involve locating technical support resources (password recovery, repair costs, setup guides)",
        "Both require interacting with product customization interfaces (storage options, accessory selection)",
        "Queries demand understanding Apple's product hierarchy (differentiating between AirPods models, Watch editions)",
        "Tasks require parsing promotional content for device-specific marketing claims/slogans",
        "Both involve financial transactions aspects (trade-in values, Apple Card integration, financing options)",
        "Tasks necessitate navigating between main product pages and specialized sections (environmental reports, enterprise solutions)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Tasks require identifying product specifications (storage, weight, dimensions, technical features)",
        "Navigation involves comparing prices between different device models or configurations",
        "Users need to locate device customization options (storage upgrades, accessories, engraving)",
        "Tasks frequently involve troubleshooting/support queries (password recovery, warranty checks, repairs)",
        "Requires checking product availability by location (store pickup, zip code-based inventory)",
        "Focus on release date information and regional availability for new products",
        "Tasks involve calculating total costs (base price vs. upgraded configurations, trade-in values)",
        "Requires comparing technical specifications between current and previous device generations",
        "Navigation patterns include accessing specialized sections (business solutions, education pricing)",
        "Tasks demand cross-referencing compatibility information (device-OS compatibility, accessory compatibility)"
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Tasks primarily involve information retrieval through search queries",
        "Users frequently seek specific, factual data (e.g., statistics, dates, definitions)",
        "Multi-step navigation required to parse results or access sub-pages",
        "Common need to compare/contrast entities (e.g., products, teams, concepts)",
        "Requires interaction with both textual and structured data presentations",
        "Emphasis on current/recent information (news, scores, market data)",
        "Frequent use of filtering by location/time/domain expertise",
        "Tasks often involve cross-referencing multiple sources or result types",
        "Common pattern of specification refinement (e.g., adding qualifiers to searches)",
        "Requires understanding hierarchical information architectures (menus, categories)"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Tasks require information retrieval through search engines as a primary action",
        "Queries involve both simple fact-finding and complex, multi-step research objectives",
        "Navigation patterns include form interactions (e.g., login attempts, recipe filters)",
        "Tasks frequently target specific content types: news articles, videos, academic papers, and product listings",
        "Many objectives require parsing and synthesizing information from multiple page elements/results",
        "Common need for temporal awareness (latest results, current statistics, upcoming events)",
        "Tasks often involve platform-specific features: maps, shopping, specialized search filters",
        "User goals frequently include comparative analysis (stock prices, recipe ingredients, product features)",
        "Multiple tasks require credibility assessment of sources and results",
        "Navigation flows demonstrate iterative refinement of search queries and filters"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Tasks involve using search engines to locate specific factual information (e.g., statistics, dates, definitions).",
        "Tasks require navigating to retrieve current or real-time data (e.g., air quality, stock performance, live scores).",
        "Tasks include accessing multimedia content (e.g., videos, trailers, images) for details like comments, titles, or metadata.",
        "Tasks demand interaction with structured web forms (e.g., login attempts, job applications, ticket bookings).",
        "Tasks involve structured data extraction from lists, tables, or search results (e.g., rankings, event details, research paper titles).",
        "Tasks focus on multi-step navigation to cross-reference information across platforms (e.g., news, research papers, social media).",
        "Tasks target domain-specific queries (e.g., technology, sports, health, entertainment) with precise keyword usage.",
        "Tasks require parsing academic, technical, or professional content (e.g., research papers, program requirements, corporate updates).",
        "Tasks involve transactional actions (e.g., purchasing items, reserving event spaces, subscribing to newsletters).",
        "Tasks necessitate verification of outcomes (e.g., login success, accuracy of retrieved data, confirmation of application submissions)."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "Tasks involve retrieving specific factual information (e.g., dates, statistics, prices, bios).",
        "Queries require current or real-time data (e.g., air quality, stock prices, latest news).",
        "Navigation includes multi-step actions (e.g., search, filter, then extract details).",
        "Tasks often target structured data extraction (e.g., lists, rankings, comparative metrics).",
        "Use of advanced search operators or filters (e.g., date ranges, location-based results).",
        "Interaction with forms or authentication flows (e.g., logins, bookings, reservations).",
        "Focus on product/service specifications (e.g., device requirements, hotel ratings).",
        "Dependence on third-party platforms integrated with search (e.g., YouTube, Gmail, Google Store).",
        "Need to parse dynamic or frequently updated content (e.g., trending topics, live scores).",
        "Emphasis on verifying source credibility (e.g., academic papers, official news outlets)."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Tasks require precise information retrieval using search engines",
        "Navigation involves interacting with links, buttons, and forms",
        "Users must locate real-time or time-sensitive data (e.g., scores, news, stock prices)",
        "Tasks demand multi-step interactions (e.g., search \u2192 filter \u2192 extract)",
        "Queries target specific entities (people, places, products, events)",
        "Actions include parsing structured content (tables, lists, menus)",
        "Tasks involve verifying credentials or account-based interactions",
        "Users must compare results (prices, features, rankings)",
        "Navigation requires accessing third-party platforms via search results",
        "Tasks prioritize extracting numerical data (dates, metrics, counts)"
      ]
    }
  },
  "diffs_synth_from_real": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Dataset B tasks explicitly require booking/reservation actions (e.g., hotel rooms, restaurant tables) while A focuses on information retrieval without transactional requirements",
        "Dataset B contains queries for specific menu items (e.g., 'Crispy Chicken Sandwich') while A focuses on establishment attributes rather than product-level details",
        "Dataset B includes multi-stop route planning with leisure activities (e.g., 'stopping at coffee shop') while A focuses on direct point-to-point navigation",
        "Dataset B tasks require price checking for specific services/products (e.g., Climate LLC's Premium plan) while A focuses on location-based pricing attributes",
        "Dataset B contains time-specific event planning (e.g., New Year's Day reservations) while A focuses on recurring temporal constraints (e.g., daytime-only parking)",
        "Dataset B includes explicit dietary requirement filters (e.g., gluten-free meals) while A's dietary constraints are implied through general accessibility features",
        "Dataset B tasks require verification of specific amenity combinations (e.g., 'playground with slides') while A focuses on singular amenity types",
        "Dataset B contains comparative analysis of product features (e.g., playground equipment) while A focuses on comparing location attributes",
        "Dataset B includes explicit date-shifting requirements (e.g., 'available one day earlier') while A focuses on current/persistent availability",
        "Dataset B tasks target review analysis for specific menu items/services (e.g., sandwich quality) while A focuses on general review sentiment analysis"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Dataset A tasks focus on specific business services (e.g., parking types, beauty salons), while B includes non-business entities (e.g., parks, trails, medical conditions).",
        "Dataset B tasks explicitly involve booking/reservation actions (e.g., hotel stays, restaurant tables), whereas A does not.",
        "Dataset B queries frequently include budget constraints (e.g., '$400', 'best price'), while A does not mention pricing.",
        "Dataset A tasks require summarizing user sentiment (e.g., 'summarize what people are saying'), whereas B focuses on retrieving factual reviews.",
        "Dataset B includes accessibility queries beyond navigation (e.g., wheelchair-accessible museums with parking details), while A focuses on accessible routes/parking types.",
        "Dataset A emphasizes quantitative result counts (e.g., 'how many results', 'list three'), while B prioritizes qualitative filters (e.g., 'highly rated', 'moderately priced').",
        "Dataset B tasks contain explicit temporal requirements for stays (e.g., '2-night stay', 'starting from January 10'), while A focuses on immediate operating hours.",
        "Dataset A requires multi-step validation of service attributes (e.g., 'not open 24 hours'), whereas B emphasizes single-step filtering (e.g., 'open now').",
        "Dataset B includes non-location-based product searches (e.g., 'wheelchair products'), while A remains strictly location-constrained.",
        "Dataset A uses zip codes as primary location anchors, while B relies more on neighborhood names (e.g., 'Le Marais', 'Williamsburg')."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Dataset B tasks more frequently involve multi-city or international locations (e.g., Paris/Tokyo/Yellowstone) while A focuses on single-city queries",
        "Dataset B contains explicit trip planning elements (e.g., 'plan a visit', 'plan a hike') absent in A's immediate navigation needs",
        "Dataset B includes specific date-based requirements (e.g., 'January 11th availability') while A uses real-time constraints",
        "Dataset B tasks require hotel/restaurant reservations and ticket purchases unlike A's information retrieval focus",
        "Dataset B features explicit budget ranges (e.g., '$350 per night') while A uses relative price descriptors",
        "Dataset B contains complex multi-destination itineraries (3+ stops) compared to A's simpler point-to-point routes",
        "Dataset B includes property transactions (e.g., 'properties for sale') absent in A's service location tasks",
        "Dataset B requires guest capacity specifications (e.g., '3 people') not present in A's queries",
        "Dataset B involves vacation planning elements (e.g., '4-6 day stay') while A focuses on immediate use cases",
        "Dataset B contains business management tasks (e.g., 'check reviews for my business') absent in A's consumer-focused queries"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Dataset B tasks involve booking or reservation actions (e.g., hotel bookings with dates/guests), while Dataset A focuses purely on location discovery without reservations.",
        "Dataset B includes tasks requiring date-specific constraints (e.g., check-in/check-out dates, flight dates), whereas Dataset A tasks use real-time or non-temporal filters (e.g., 'open now').",
        "Dataset B tasks explicitly reference travel planning across cities (e.g., 'from San Francisco to Palo Alto'), while Dataset A focuses on navigation within or between fixed locations.",
        "Dataset B incorporates tasks that require interaction with media content (e.g., '360\u00b0 and Street View images', 'photo gallery'), which are absent in Dataset A.",
        "Dataset B tasks frequently involve price comparisons and budget constraints (e.g., '$160 budget'), while Dataset A uses broader price ranges (e.g., '$$') without explicit monetary thresholds.",
        "Dataset B includes tasks requiring itinerary-based planning (e.g., multi-day stays, flight dates), whereas Dataset A focuses on immediate or single-step actions.",
        "Dataset B tasks involve explicit user review creation (e.g., 'be prepared to write a review'), while Dataset A only requires reading existing reviews.",
        "Dataset B tasks reference amenities tied to travel logistics (e.g., 'free cancellation', 'breakfast options'), whereas Dataset A emphasizes static amenities (e.g., 'EV charging', 'wheelchair accessibility').",
        "Dataset B includes tasks with multi-city or international route planning (e.g., 'Paris to Stanford'), while Dataset A focuses on local or intra-city routes.",
        "Dataset B tasks require historical/environmental research (e.g., 'environmental threats faced by regions'), which are absent in Dataset A."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Dataset B tasks require explicit reservation/bookings (date-specific hotel rooms/restaurant tables) while Dataset A focuses on discovery without reservations",
        "Dataset B includes itinerary planning with multiple stops (e.g. 'suggest bike-friendly restaurants along the way') whereas Dataset A focuses on single-destination routing",
        "Dataset B tasks request price comparisons/optimization ('best price', 'cheapest French brunch') while Dataset A emphasizes price range identification",
        "Dataset B requires interaction with temporal event-based queries ('New Year's Eve', 'brunch on January 1') unlike Dataset A's present-time focus",
        "Dataset B contains tasks about viewing/analyzing photos of locations while Dataset A never references visual content inspection",
        "Dataset B includes purchase transactions (ticket buying) absent from Dataset A's information-gathering focus",
        "Dataset B tasks reference elevation profiles/trail difficulty ('hiking trail elevation') unlike Dataset A's flat route planning",
        "Dataset B requires cancellation policy checks ('free cancellation option') while Dataset A focuses on permanent accessibility features",
        "Dataset B contains conceptual/thematic queries ('effect of climate change on glaciers') unlike Dataset A's concrete location needs",
        "Dataset B tasks involve tourism planning (day trips, attraction visits) while Dataset A emphasizes local service discovery"
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks in B require explicit identification of pricing details for specific GitHub products (e.g., Copilot Pro, Codespaces)",
        "B tasks demand direct interaction with compliance documentation (e.g., terms of service, privacy policy updates)",
        "B includes tasks requiring understanding of vulnerability reporting processes and advisory lookup mechanisms",
        "Tasks in B focus on educational resource enrollment rather than just discovery (e.g., signing up for courses)",
        "B requires comparison of AI tool capabilities across plans (e.g., Copilot's commit message generation)",
        "Tasks in B involve explicit exploration of incident history and service status reporting",
        "B tasks require navigation through security certification documentation for specific products",
        "Tasks in B demand identification of intellectual property implications for AI-generated code",
        "B includes tasks requiring interaction with mobile-specific functionality and app features",
        "Tasks in B focus on marketplace tool discovery (e.g., Hyperlint AI) rather than general repository search"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Dataset B tasks require identifying security advisory severity levels (e.g., high-severity) while A focuses on general security feature location",
        "Dataset B includes compliance documentation tasks (e.g., CSA STAR Certificate) not present in A",
        "Dataset B tasks involve API technical comparisons (REST vs GraphQL) while A focuses on API documentation navigation",
        "Dataset B contains specific pricing quote requests (e.g., Advanced Security) while A focuses on plan feature comparisons",
        "Dataset B tasks require understanding data processing policies (Copilot training data usage) not emphasized in A",
        "Dataset B includes GitHub Actions workflow implementation tasks (finding specific actions) absent in A",
        "Dataset B contains job search-related tasks (GitHub career openings) not found in A",
        "Dataset B requires pull request-issue linkage tasks while A focuses on general repository metadata",
        "Dataset B includes AI ethics tasks (responsible Copilot use) not present in A",
        "Dataset B contains localization tasks (page translation) while A focuses on English-only navigation"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Dataset B tasks require understanding GitHub Copilot's security features and data protection compliance.",
        "Dataset B tasks involve investigating GitHub's data usage policies and privacy practices for business/enterprise users.",
        "Dataset B tasks focus on educational resources and academic use cases (e.g. GitHub Classroom, Educators projects).",
        "Dataset B tasks require troubleshooting specific technical implementations (e.g. Metrics embed GitHub Action errors).",
        "Dataset B tasks emphasize comparing/understanding GitHub Copilot extensions and integrations.",
        "Dataset B tasks involve explicit account creation requirements for accessing specific features like Copilot.",
        "Dataset B tasks require deeper analysis of pricing structures and upgrade paths between service tiers.",
        "Dataset B tasks focus on documentation comprehension for security implementations and secure development practices.",
        "Dataset B tasks include policy-related investigations (e.g. private repository confidentiality, data collection policies).",
        "Dataset B tasks require understanding feature eligibility requirements and technical specifications for AI tools."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Tasks in B require navigating enterprise-specific pricing details for specialized use cases (e.g., NLP projects, educational trials)",
        "Tasks in B involve finding implementation details for custom integrations (e.g., GraphQL API, project management workflows)",
        "Tasks in B require understanding technical compliance documentation (e.g., CSA STAR Certificate, data retention policies)",
        "Tasks in B focus on security implementation specifics (Copilot security measures, vulnerability mitigation strategies)",
        "Tasks in B require comparing extended Copilot plan features beyond basic tier comparisons (Autofix inclusion, Enterprise options)",
        "Tasks in B involve exploring system status/outage information and platform reliability checks",
        "Tasks in B require analyzing mobile app ecosystem integrations (Google Play Store ratings, GitHub Mobile features)",
        "Tasks in B focus on educational/classroom-specific implementations (autograding, educational trials)",
        "Tasks in B require deeper policy analysis (user-generated content terms, intellectual property rights)",
        "Tasks in B involve technical capability assessments for AI tools (Copilot limitations, Hyperlint AI extensions)"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Tasks in dataset B require understanding how to create or manage user projects, while dataset A focuses on locating existing repositories.",
        "Dataset B includes tasks related to troubleshooting project creation (e.g., rate limits/errors), which are absent in dataset A.",
        "Dataset B requires detailed comparisons of API technical implementations (REST vs GraphQL scalars), while dataset A compares product/service categories at a higher level.",
        "Tasks in dataset B involve navigating documentation for specific feature implementations (e.g., task lists, tables), whereas dataset A focuses on general documentation exploration.",
        "Dataset B contains tasks about GitHub Copilot's data sources/processing policies, while dataset A only addresses subscription options.",
        "Dataset B includes explicit instructions to locate security vulnerability details (CVE IDs), whereas dataset A focuses on general vulnerability remediation features.",
        "Tasks in dataset B require understanding plan upgrade paths (free to Pro), while dataset A only compares static plan features.",
        "Dataset B contains tasks about GitHub project view configurations (e.g., board layouts), which are absent in dataset A.",
        "Dataset B includes tasks about GitHub's trademark policies and account creation terms, while dataset A focuses on basic form interactions.",
        "Dataset B requires analysis of security advisory filtering (high severity) and disclosure processes, whereas dataset A focuses on vulnerability detection tools."
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Dataset B includes tasks focused on NCAA bowl game performance tracking, which are absent in Dataset A.",
        "Dataset A requires retrieving historical game schedules (e.g., December 25, 2023), while Dataset B emphasizes real-time or upcoming game tracking without specific historical dates.",
        "Dataset B includes navigation tasks for UEFA Conference League scores, which are not present in Dataset A's sports scope.",
        "Dataset B contains queries for fantasy basketball research (e.g., Detroit Pistons), whereas Dataset A focuses on general fantasy sports recommendations without team-specific analysis.",
        "Dataset A tasks involve checking player salaries (e.g., Boston Celtics roster salaries), a feature absent in Dataset B\u2019s sampled tasks.",
        "Dataset B requires locating postponed NHL game statuses (e.g., CGY vs. LA), while Dataset A focuses on live/completed game differentiation without explicit postponed game tracking.",
        "Dataset A includes NCAAW recruiting inquiries (e.g., top college recruits), which are not present in Dataset B\u2019s sampled tasks.",
        "Dataset B emphasizes real-time MLB game tracking (e.g., inning-specific updates like 'Top 6th'), whereas Dataset A\u2019s MLB tasks focus on final scores.",
        "Dataset B tasks involve identifying team lineups post-injury (e.g., Phoenix Suns after Kevin Durant\u2019s injury), while Dataset A focuses on general injury reports without lineup implications.",
        "Dataset B includes specific betting odds for NFL weeks (e.g., Week 17), whereas Dataset A tasks compare betting odds generically across leagues."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Dataset B includes tasks related to college football (NCAAF) and NFL, which are not present in Dataset A",
        "Dataset B requires navigation to fantasy sports betting tips and odds comparisons, while Dataset A focuses on Tournament Challenge brackets",
        "Dataset B tasks involve accessing international soccer/football content (e.g. Portuguese Primeira Liga, transfer rumors) not seen in Dataset A",
        "Dataset B contains requests for specific player trade simulations between different sports leagues, unlike Dataset A's team-specific queries",
        "Dataset B includes navigation through multiple league seasons (e.g. 2024-25 NBA season comparisons) while Dataset A focuses on current season data",
        "Dataset B tasks require accessing NFL Playoff Machine tools and schedule analysis features not present in Dataset A",
        "Dataset B involves direct comparisons of team performance across multiple seasons (e.g. Cavaliers vs Thunder comparisons) not found in Dataset A",
        "Dataset B contains requests for collegiate sports recruiting information (NCAAW recruiting) while Dataset A focuses on professional sports data",
        "Dataset B tasks require navigation through sports betting odds across multiple leagues (NFL/NCAAF comparisons) unlike Dataset A's betting odds links",
        "Dataset B includes navigation to podcast content (ESPN Radio NBA podcasts) not referenced in Dataset A tasks"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Dataset A tasks focus on NBA, NCAA basketball, and MLB, while Dataset B emphasizes NFL, NCAA football, and soccer leagues like English Premier League",
        "Dataset B includes explicit requests for future schedules/events (e.g., 2024-25 College Football Playoff), while A focuses on historical/present-day game results",
        "Dataset B requires navigation of postseason-specific formats (bowl games, CFP bracket) not present in A's tournament challenge-focused tasks",
        "Dataset A contains more granular date-specific filtering requirements (e.g., exact dates like December 25) compared to B's week-based filtering (e.g., Week 18)",
        "Dataset B includes direct requests for betting odds analysis (e.g., AFC Champion odds) absent in A's tasks",
        "Dataset A features more player/team comparison tasks within singular sports, while B compares scores across different sports/games",
        "Dataset B requires navigation of international soccer league structures (EPL table, Serie A) not present in A's North American sports focus",
        "Dataset A contains specific requests for seasonal performance tracking (e.g., standings through 2023-24 season) where B emphasizes single-game statistics",
        "Dataset B includes explicit requirements for broadcast/streaming information lookup (ESPN+ content, channel listings) not found in A",
        "Dataset A tasks frequently require parsing complex statistics (BPI rankings, salary data) while B focuses more on score retrieval and schedule verification"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Dataset B includes tasks related to college football (NCAAF) bowl games and postseason matchups, which are not present in Dataset A.",
        "Dataset B requires navigation to NFL playoff scenarios and tiebreaker information, absent in Dataset A\u2019s tasks.",
        "Dataset A includes tasks involving NBA player salaries and injury reports, which are not mentioned in Dataset B.",
        "Dataset B contains requests for sports podcast discovery and ESPN Radio content, while Dataset A does not.",
        "Dataset A focuses on NBA-specific date-filtered tasks (e.g., December 25, 2023 games), whereas Dataset B emphasizes college football season schedules.",
        "Dataset B requires identification of NCAA transfer portal news between collegiate teams, not present in Dataset A.",
        "Dataset A tasks involve cross-sport comparisons (e.g., 'Golden' team names across NHL/NBA), while Dataset B focuses on single-sport team comparisons.",
        "Dataset B includes requests for historical player statistics (e.g., Michael Jordan\u2019s Wizards era), absent in Dataset A\u2019s contemporary player focus.",
        "Dataset B tasks require navigation to fantasy baseball rankings and projections, which Dataset A does not reference.",
        "Dataset A involves conference-specific recruiting information (e.g., NCAAW top recruits), while Dataset B focuses on active roster comparisons (e.g., NFL team rosters)."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Dataset B tasks focus on NFL and NCAAF more prominently compared to Dataset A's NBA/NCAA basketball emphasis",
        "Dataset B includes explicit requests for draft results (e.g., 2024 NBA/NFL drafts) while Dataset A focuses on tournament challenges",
        "Dataset B contains tasks requiring multi-team trade analysis not present in Dataset A",
        "Dataset B tasks frequently reference specific weeks/seasons (e.g., '2024 NFL season', 'Week 16') unlike Dataset A's date-based queries",
        "Dataset B includes esports-related queries absent from Dataset A",
        "Dataset A tasks emphasize player-specific statistics breakdowns while Dataset B focuses more on team performance summaries",
        "Dataset B contains more requests for future/predictive information (playoff chances, schedules) compared to Dataset A's historical game focus",
        "Dataset B tasks require navigation through advertisement content more frequently than Dataset A",
        "Dataset A shows stronger emphasis on conference-specific standings analysis compared to Dataset B",
        "Dataset B includes lacrosse league queries not present in Dataset A's task samples"
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Tasks in dataset B require locating and interacting with academic research papers and their associated content (e.g., abstracts, HTML formats)",
        "Dataset B tasks involve explicit requests for commercial/enterprise usage validation (e.g., licensing for commercial products, business applications)",
        "Tasks in B demand technical environment setup guidance (e.g., Colab integration, library installation, CI/CD workflows)",
        "Dataset B contains tasks requiring multimedia content generation/processing (video-text models, image generation from text)",
        "B includes accessibility-focused technical requirements (e.g., CHI guidelines, HTML format conversion for papers)",
        "Tasks in B emphasize community-driven technical support acquisition (e.g., forum interactions, tutorial collaboration)",
        "Dataset B requires infrastructure configuration details (GPU usage, API endpoint setup, cloud pricing)",
        "B tasks involve granular dataset analysis (row counts, memory usage, specific data transformations like Parquet conversion)",
        "Dataset B contains explicit legal/compliance requirements (commercial licensing verification, citation mandates)",
        "Tasks in B require cross-platform integration (GitHub repository navigation, SDK downloads, external tool linking)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Dataset B tasks frequently involve verifying licensing restrictions and commercial use compliance, whereas Dataset A focuses on general licensing awareness",
        "Dataset B requires cross-referencing with external research papers (arXiv) for model validation, while Dataset A uses internal documentation",
        "Dataset B includes tasks about third-party integrations (GitHub Copilot, LM Studio), unlike Dataset A's Hugging Face-specific API focus",
        "Dataset B emphasizes benchmark-specific performance validation (e.g. MMMU scores), while Dataset A references benchmarks more generally",
        "Dataset B tasks require identifying model architecture details and training methodologies, whereas Dataset A focuses on functional capabilities",
        "Dataset B contains explicit requests for dataset provenance tracking, while Dataset A focuses on dataset characteristics and popularity",
        "Dataset B includes tasks involving external deployment tools (GitHub Actions), unlike Dataset A's focus on Hugging Face Spaces",
        "Dataset B tasks address GDPR/data privacy compliance, which are absent in Dataset A requirements",
        "Dataset B requires comparison of models within specific application contexts (e.g. image classification spaces), while Dataset A comparisons are modality-based",
        "Dataset B tasks involve commercial product development research, whereas Dataset A focuses on research/experimental use cases"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Dataset B tasks require retrieving model implementation details for specific technical environments (e.g., TensorRT-LLM backend numerical precision)",
        "Dataset B includes tasks involving multilingual documentation access (e.g., German installation instructions)",
        "Dataset B contains tasks requiring troubleshooting of model implementation errors (e.g., 'Task not found' error resolution)",
        "Dataset B emphasizes model version history tracking and comparison (e.g., codegen-350M-mono version history)",
        "Dataset B tasks involve searching for models with specific commercial integration capabilities (e.g., AI tools for commercial products)",
        "Dataset B requires identification of models with specialized technical requirements (e.g., Node.js CI/CD workflows through GitHub Actions)",
        "Dataset B includes tasks focused on low-resource language support (e.g., Indonesian sentiment analysis models)",
        "Dataset B contains tasks requiring cross-referencing with regional platforms (e.g., Zhihu/WeChat integration)",
        "Dataset B emphasizes model architecture understanding (e.g., Transformer inner workings documentation)",
        "Dataset B tasks involve accessing source code repositories for model implementation (e.g., ultravox model source code)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Dataset B tasks emphasize model discovery for specific application use cases (e.g., dog breed classification, anime-style generation)",
        "Dataset B includes explicit tasks requiring SDK/package installation steps (e.g., Mac SDK installation)",
        "Dataset B contains error resolution tasks (e.g., 'Task not found' errors) not present in A",
        "Dataset B requires research paper retrieval linked to models (e.g., ModernBERT-base paper)",
        "Dataset B focuses on beginner-friendly educational resources (e.g., NLP tutorials) as primary targets",
        "Dataset B tasks involve commercial licensing awareness (e.g., commercial usage restrictions)",
        "Dataset B emphasizes model popularity metrics (download counts/likes) over technical performance metrics",
        "Dataset B requires cross-platform navigation (GitHub, arXiv) beyond Hugging Face ecosystem",
        "Dataset B contains multimedia-focused tasks (text-to-image, image-to-image models) as primary objectives",
        "Dataset B tasks prioritize model functionality understanding over technical configuration details"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Dataset B tasks emphasize finding model implementation details and integration methods (e.g. local execution, format conversion) rather than just status metrics",
        "Dataset B contains explicit requirements to verify ethics compliance and usage licenses before model adoption",
        "Dataset B includes tasks focused on academic research artifacts (paper finding, tutorial navigation) rather than just practical implementations",
        "Dataset B tasks require navigation of Chinese-language resources and models (e.g. Qwen series, WeChat integration)",
        "Dataset B emphasizes commercial application scenarios (product development, business use cases) more prominently",
        "Dataset B contains specific technical conversion tasks between data formats (e.g. Parquet conversion) not present in A",
        "Dataset B requires interaction with community resources (forums, Zhihu) for problem solving",
        "Dataset B includes optimization-focused tasks (CPU inference optimization, performance benchmarking)",
        "Dataset B contains more chatbot-specific interaction tasks (medical QAs, conversational interfaces)",
        "Dataset B tasks require navigation of multimodal integration documentation (text+image+audio workflows)"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Dataset B tasks emphasize verifying median salary figures and job availability counts for career roles, while Dataset A focuses on course rating percentages and review counts.",
        "Dataset B includes navigation tasks requiring identification of AI-specific skills within professional certificates (e.g., Generative AI Engineering), whereas Dataset A focuses on general AI/ML content without explicit skill tagging.",
        "Dataset B tasks involve locating time-bound promotional offers with specific annual pricing (e.g., $199/year), while Dataset A references monthly billing ($25/month).",
        "Dataset B requires exploration of domain-specific certifications like 'Marketing with TikTok' and 'Medical Office Manager,' whereas Dataset A emphasizes broader domains like Data Science or Business.",
        "Dataset B tasks demand verification of newer platform features like 'AI Python for Beginners' courses, while Dataset A references established features like Guided Projects.",
        "Dataset B includes tasks to identify credentials tied to niche roles (e.g., Social Media Strategist), while Dataset A focuses on mainstream roles like Data Analyst or Cyber Security Analyst.",
        "Dataset B tasks require filtering by career-relevant metrics like 'job-ready certificates with AI skills,' whereas Dataset A prioritizes technical filters like duration or credit eligibility.",
        "Dataset B emphasizes degree pathways with pricing transparency (e.g., 'Breakthrough pricing on 100% online degrees'), while Dataset A highlights general degree program availability.",
        "Dataset B tasks involve identifying free project-based learning (e.g., 'Multimodal Llama 3.2'), whereas Dataset A focuses on free courses with skill tags.",
        "Dataset B requires comparison of specialized collections like 'Popular certificates with new AI skills,' while Dataset A compares generic professional certificates (e.g., Google vs. IBM)."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Dataset B tasks emphasize social impact and human rights-related course queries, while Dataset A focuses on technical and career-specific skills.",
        "Tasks in Dataset B include inquiries about application processes and university admissions, absent in Dataset A tasks.",
        "Dataset B tasks frequently involve searching for courses based on ethical or societal impact (e.g., 'finance for social good'), whereas Dataset A prioritizes quantitative metrics like ratings/duration.",
        "Dataset A requires granular review analysis (e.g., percentage breakdown of star ratings), while Dataset B focuses on general course descriptions.",
        "Dataset B includes language-specific course queries (e.g., 'courses in German'), whereas Dataset A emphasizes language requirements for accessibility.",
        "Tasks in Dataset B explore broader learning goals like 'arts courses' or 'translation for language learning,' while Dataset A targets credential-driven outcomes (certifications/degrees).",
        "Dataset A tasks demand explicit price/certification comparisons (free vs paid), while Dataset B emphasizes free course discovery without cost analysis.",
        "Dataset B contains exploratory tasks without filters (e.g., 'Explore Coursera courses'), whereas Dataset A requires structured hierarchical navigation (modules/assessments).",
        "Dataset A tasks involve prerequisite validation (e.g., 'experience level'), while Dataset B focuses on skill applicability (e.g., 'recommended experience for Data Analyst roles').",
        "Dataset B includes career path exploration queries (e.g., 'career development opportunities'), while Dataset A emphasizes direct job-ready skill extraction (salary data/role requirements)."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Dataset A tasks require extracting exact quantitative metrics (e.g., star rating percentages, review counts), while Dataset B focuses on qualitative skill/knowledge outcomes from courses",
        "Dataset A includes explicit filtering by 'Credit Eligible' status in course searches, whereas Dataset B tasks never reference academic credit eligibility",
        "Dataset B tasks frequently involve investigating course curricula/modules (e.g., 'first two modules of Google PM course'), while Dataset A focuses on surface-level metadata",
        "Dataset A requires identification of instructor backgrounds and university partnerships, while Dataset B emphasizes career path requirements and role definitions",
        "Dataset B contains explicit enrollment/sign-up actions as task requirements, unlike Dataset A which focuses purely on information retrieval",
        "Dataset A tasks demand comparison of multiple programs' duration/salary metrics, while Dataset B emphasizes single-course content analysis",
        "Dataset B includes foundational knowledge queries (e.g., 'definition of data analytics'), which are absent from Dataset A's task requirements",
        "Dataset A requires validation of institution-country relationships (e.g., Australian partners), while Dataset B focuses on course-language availability",
        "Dataset B tasks involve cross-referencing course content with external career requirements, whereas Dataset A matches courses to technical skill filters",
        "Dataset A emphasizes free resource identification through tag searching, while Dataset B prioritizes paid credential program structures"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Dataset B emphasizes job-ready certificates with AI-enhanced skills (e.g., Google Professional Certificates with AI components) across all samples, while Dataset A focuses on traditional certification paths.",
        "Dataset B includes navigation tasks requiring identification of career outcome metrics (e.g., median salaries, job availability counts) for roles, whereas Dataset A focuses on course quality metrics like ratings/reviews.",
        "Dataset B features newer AI-focused content categories (e.g., GenAI, AI Agents for Leaders) in all samples, while Dataset A shows no equivalent AI specialization focus.",
        "Dataset B contains navigation tasks for hands-on project-based learning (e.g., Hugging Face projects) across all samples, while Dataset A emphasizes theoretical module completion.",
        "Dataset B requires differentiation between university-offered vs. corporate-offered programs in all queries, while Dataset A tasks treat institutional partners uniformly.",
        "Dataset B includes price anchoring references ($199/year Coursera Plus) in promotional elements across all samples, while Dataset A uses different pricing models.",
        "Dataset B navigation tasks frequently involve career academy role pathways (e.g., Data Analyst/Scientist role pages) as primary content organization, unlike Dataset A's discipline-based organization.",
        "Dataset B emphasizes credit applicability toward degrees more prominently across all samples (e.g., 'Build toward a degree' tags), while Dataset A focuses on standalone course attributes.",
        "Dataset B contains time-bound promotional content (e.g., New Year savings offers) across all samples, while Dataset A shows evergreen pricing structures.",
        "Dataset B navigation tasks require filtering by emerging skill categories (e.g., 'Prompting Essentials') in all samples, while Dataset A uses established skill taxonomies."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Dataset B tasks require detailed curriculum exploration for degree programs (e.g., course sequences, concentrations)",
        "Dataset B emphasizes enrollment process details (admission requirements, credit transfer policies)",
        "Tasks in B demand identification of course-specific technical tools (e.g., Electric VLSI EDA Tool)",
        "Dataset B queries focus on program prerequisites rather than general skill levels",
        "B requires analysis of course refund policies and subscription terms",
        "Tasks in B involve direct sign-up/registration process navigation",
        "Dataset B emphasizes career path alignment with specific job roles",
        "B tasks require identification of course specializations within degree programs",
        "Dataset B queries focus on standardized test preparation (e.g., GRE-related content)",
        "B requires comparison of program structures rather than provider types"
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Dataset B tasks require deeper interaction with full-text content (e.g., specific sections, references) rather than just metadata extraction",
        "Dataset B contains tasks involving document structure analysis (e.g., TeX files, HTML formatting, licensing information)",
        "Dataset B includes troubleshooting tasks related to technical paper presentation (e.g., HTML conversion errors)",
        "Dataset B tasks more frequently require understanding of paper formatting standards (TeX/LaTeX/MathML)",
        "Dataset B emphasizes direct content engagement (e.g., 'read the related work section') rather than quantitative analysis",
        "Dataset B contains tasks requiring navigation through paper versions/format comparisons (HTML vs. PDF vs. source)",
        "Dataset B tasks involve locating and interpreting copyright/licensing information for papers",
        "Dataset B requires following internal document references (e.g., finding specific sections within papers)",
        "Dataset B includes tasks focused on paper preservation/archiving aspects (e.g., download formats)",
        "Dataset B tasks demand understanding of academic document structure (abstracts, references, methodology sections)"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Dataset A tasks focus on precise metadata extraction (e.g., version dates, author counts) while Dataset B emphasizes content comprehension (e.g., abstracts, introductions)",
        "Dataset A includes time-bound queries requiring date filters; Dataset B lacks explicit date constraints in most tasks",
        "Dataset A tasks require cross-referencing multiple UI elements (e.g., category counts + random selection); Dataset B focuses on linear retrieval",
        "Dataset A contains interface interaction tasks (e.g., shopping cart additions) absent in Dataset B",
        "Dataset B tasks frequently involve citation/reference navigation while Dataset A focuses on primary document properties",
        "Dataset A requires comparison between category-specific vs archive-wide searches; Dataset B focuses on single-category exploration",
        "Dataset B includes format-specific access tasks (e.g., HTML vs PDF) more prominently than Dataset A",
        "Dataset A tasks demand quantitative analysis of search results; Dataset B emphasizes qualitative understanding",
        "Dataset B contains more interdisciplinary topic queries (e.g., AI+Quantum) compared to Dataset A's domain-specific focus",
        "Dataset A includes institutional affiliation tracing tasks absent in Dataset B"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Dataset B requires accessing specific sections within papers (e.g., 'Results', 'Methodology') for task completion, while Dataset A focuses on metadata extraction (e.g., submission dates, version history).",
        "Tasks in Dataset B involve direct interactions with paper formats (e.g., HTML accessibility-friendly views), whereas Dataset A emphasizes multi-format availability without explicit format-based actions.",
        "Dataset B includes tasks requiring troubleshooting (e.g., resolving layout errors during submission), absent in Dataset A.",
        "Dataset B tasks reference arXiv license types and copyright policies, while Dataset A does not address legal or usage terms.",
        "Dataset B requires locating papers via exact arXiv IDs (e.g., 'arXiv:2412.18601'), whereas Dataset A relies on keyword/date filters for retrieval.",
        "Dataset B tasks involve author-specific searches (e.g., 'Ariel Shlosberg'), while Dataset A focuses on general author lists without individual attribution.",
        "Dataset B tasks demand citation tracking (e.g., 'gather information about citing a specific paper'), unlike Dataset A.",
        "Dataset B includes navigating paper structure (e.g., 'Related Work' sections), while Dataset A focuses on locating specialized sections (e.g., abstract) without structural traversal.",
        "Dataset B tasks require validating external criteria (e.g., conference acceptances) post-retrieval, whereas Dataset A cross-references criteria during search.",
        "Dataset B tasks involve post-download actions (e.g., 'add to cart' for merchandise), absent in Dataset A's metadata-centric workflows."
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Dataset A tasks emphasize metadata aggregation across multiple papers (e.g., counts per announce day), while B focuses on granular content extraction from single papers (e.g., specific sections/references)",
        "Dataset A requires date range filtering as a core task component, while B prioritizes direct content access without temporal constraints",
        "Dataset B tasks involve explicit format handling (PDF/HTML selection, source code downloads), whereas A focuses on abstract metadata retrieval",
        "Dataset A tasks require cross-category comparison (e.g., main vs. nested archives), while B concentrates on intra-paper navigation",
        "Dataset B contains tasks requiring citation/reference chain navigation (e.g., 10th reference lookup), absent in A's requirements",
        "Dataset A includes random sampling from result sets as a task pattern, while B emphasizes deterministic content retrieval",
        "Dataset B tasks demand understanding of paper structure (methods/results/figures), whereas A focuses on system-level organization (categories/versions)",
        "Dataset A requires affiliation/conference cross-referencing, while B focuses on self-contained paper analysis",
        "Dataset B includes supplementary material access (source code/data), not present in A's task specifications",
        "Dataset A tasks involve multi-step filtering combinations (category + date + keyword), while B emphasizes direct keyword-to-content paths"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Dataset B tasks require downloading specific paper formats (e.g., PDF, TeX) more frequently than Dataset A",
        "Dataset B includes tasks involving troubleshooting submission processes or technical issues not present in Dataset A",
        "Dataset B tasks demand direct interaction with paper content (e.g., methodology sections, problem statements) rather than just metadata",
        "Dataset B contains tasks requiring identification of formatting issues in papers' content",
        "Dataset B tasks more frequently specify exact paper titles for retrieval compared to Dataset A's keyword-based searches",
        "Dataset B includes requests for source code availability related to research papers",
        "Dataset B tasks require cross-referencing citations within papers' reference sections",
        "Dataset B contains requests for author background information and publication histories",
        "Dataset B tasks involve analysis of paper appendices/supplementary materials more frequently",
        "Dataset B includes tasks requiring comparison of multiple paper versions for content changes"
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Dataset B tasks require handling interactive elements (e.g., surveys/pop-ups) before accessing content",
        "Dataset B contains time-sensitive sports score tracking requirements not present in A",
        "Dataset B tasks involve locating multimedia content (podcasts/video archives) as primary objectives",
        "Dataset B requires navigation through temporary/promotional content sections (e.g., 'Quiz of the Year')",
        "Dataset B tasks demand identification of content through visual timeline features for historical events",
        "Dataset B contains tasks requiring differentiation between live event coverage vs archival reporting",
        "Dataset B tasks involve parsing compound geographic-temporal patterns (e.g., holiday weather impacts)",
        "Dataset B requires identification of content through author expertise indicators (e.g., 'BBC reporter looks back')",
        "Dataset B tasks demand recognition of recurring special report formats (e.g., annual celebrity flops list)",
        "Dataset B contains requirements to navigate through layered opinion/analysis vs straight news reporting"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Tasks in B require locating future event schedules (e.g., 'February 2025 matches') while A focuses on retrospective timestamps (e.g., '2 hrs ago')",
        "B includes multimedia exploration tasks beyond articles (e.g., podcast episodes, video libraries) whereas A primarily targets embedded article multimedia",
        "B tasks involve navigating service-oriented sections (e.g., hotel info, education courses) absent from A's news-centric taxonomy",
        "B requires joining external groups/organizations (e.g., camera clubs) while A focuses solely on information consumption",
        "B contains real-time financial queries (e.g., Bitcoin price checks) vs A's static article metadata extraction",
        "B tasks demand structural website familiarity (e.g., 'explore layout') whereas A assumes pre-existing navigation hierarchy knowledge",
        "B includes open-ended exploratory goals (e.g., 'casually browse') contrasting with A's strictly defined information retrieval",
        "B requires precision weather/time queries (e.g., 'Bras\u00edlia forecast at 06:00') vs A's regional section access",
        "B tasks span educational/professional domains (e.g., university courses) beyond A's journalistic content scope",
        "B involves multi-platform navigation (e.g., article\u2192video trajectories) while A maintains linear hierarchical paths"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Dataset B requires navigation through science/space exploration content absent in A",
        "Dataset B contains tasks involving BBC Sounds podcast navigation not present in A",
        "Dataset B includes specific event-based navigation (e.g., COP29, 2004 tsunami) rather than category-based",
        "Dataset B requires handling of multilateral geopolitical conflicts (Syria, Russia-Finland-Estonia)",
        "Dataset B features environmental impact analysis tasks beyond climate guides present in A",
        "Dataset B contains niche cultural navigation (urban sketching retreats, literary travel destinations)",
        "Dataset B includes real-time weather forecast comparisons across multiple cities",
        "Dataset B requires medical breakthrough identification (Crohn's disease treatment)",
        "Dataset B integrates accommodation booking tasks within navigation flows",
        "Dataset B incorporates social media trend analysis (TikTok underconsumption) in task requirements"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Tasks in Dataset B require navigating through specialized content areas like Formula 1, graphene technology, and NASA missions not present in Dataset A",
        "Dataset B tasks involve accessing podcast episodes (e.g., Sporting Giants) and live event streaming options not emphasized in Dataset A",
        "Navigation in Dataset B requires interaction with time-specific event calendars (e.g., January 2025 football fixtures) rather than relative timestamps alone",
        "Dataset B tasks include locating academic course information (e.g., Cardiac Physiology degrees) requiring education-related navigation paths",
        "Tasks in Dataset B demand navigation through sustainability-focused content areas (e.g., fabric recycling in fashion) not explicitly required in Dataset A",
        "Dataset B requires interaction with donation forms/systems (e.g., Christmas charity donations) absent from Dataset A tasks",
        "Navigation in Dataset B involves accessing specialized weather features (hour-by-hour forecasts for multiple cities) with interactive map components",
        "Dataset B tasks require locating military conflict prisoner information (e.g., Guantanamo detainees) needing historical archive navigation",
        "Tasks in Dataset B involve cryptocurrency market tracking (e.g., Bitcoin price movements) requiring financial data navigation layers",
        "Dataset B requires navigation through technology ethics content (e.g., AI in sports) with specialized subsection structures"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Tasks in B require accessing multimedia archives (e.g., historical videos/podcasts) rather than only current articles",
        "B includes instructions involving date-specific future events (e.g., 'January 1, 2025' cricket matches)",
        "B tasks require navigation to localized UK regional sections (e.g., Tayside & Central Scotland) beyond continental regions",
        "B contains tasks about educational/academic content (e.g., university course details)",
        "B requires identifying specialized project-based initiatives (e.g., Recetas nature remedies project)",
        "B includes instructions to verify information across non-adjacent sections (e.g., cybersecurity human aspects)",
        "B tasks involve accessing hospitality/service industry content (e.g., 9h hotel sleep analysis)",
        "B requires navigation through entertainment biopic content (e.g., Maria Callas role research)",
        "B contains tasks about niche cultural phenomena (e.g., 1970s American gay club dance styles in India)",
        "B includes instructions to locate specific numerical market data (e.g., Premier League fixture dates)"
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Dataset B tasks prioritize finding gifts or generic items (e.g., 'gift for a female') while Dataset A requires precise attribute-based searches (e.g., 'USB-C hub with HDMI').",
        "Dataset B includes tasks focused on pre-owned/luxury items (e.g., 'pre-loved Louis Vuitton handbag') not explicitly mentioned in Dataset A.",
        "Dataset B tasks often involve purchasing multiple items (e.g., 'Buy multiple Harry Potter books') compared to single-item actions in Dataset A.",
        "Dataset B includes explicit requests for discounts/coupons (e.g., 'Get a discount on Cellucor pre workout powder') while Dataset A focuses on sale identification without coupon specifics.",
        "Dataset B tasks reference niche categories like gourmet foods and protection plans (e.g., 'high-end floor lamp including protection plan') absent in Dataset A.",
        "Dataset A requires validating detailed technical specifications (e.g., 'compatibility with MacBook Pro') whereas Dataset B focuses on basic price comparisons.",
        "Dataset B contains more open-ended exploration tasks (e.g., 'Explore gourmet food products') compared to Dataset A's structured category navigation.",
        "Dataset A tasks frequently require cross-referencing review metrics (e.g., 'over 20,000 reviews') while Dataset B only mentions basic star ratings.",
        "Dataset B includes bulk/quantity-based purchasing (e.g., 'Add some office supplies') whereas Dataset A specifies exact quantities.",
        "Dataset A emphasizes logistical verification (e.g., 'return policy for Mens Shirt') while Dataset B focuses on purchase completion without policy checks."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Tasks in dataset B require less granular product attribute specification compared to dataset A's explicit size/color/price combinations",
        "Dataset B includes tasks focused on service utilization (Prime Video, Amazon Fresh) absent in dataset A",
        "Dataset A tasks emphasize precise price range constraints while B allows open-ended price exploration",
        "Dataset B contains more generic purchase actions without product specifications ('Buy one item')",
        "Dataset A requires multi-criteria filtering while B focuses on single-attribute comparisons",
        "Dataset B includes brand-specific searches (Aquazzura, Louis Vuitton) without detailed specifications",
        "Dataset A tasks demand explicit sorting mechanisms (Best Sellers) while B uses implicit sorting",
        "Dataset B contains service-oriented actions (movie rentals, account creation) beyond product transactions",
        "Dataset A requires availability checks and return policy verification absent in B's tasks",
        "Dataset B includes vague objectives ('Browse for products') versus A's concrete product requirements"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Dataset B tasks emphasize luxury brand interactions (e.g., Oscar de la Renta, Louis Vuitton) while Dataset A focuses on general consumer goods",
        "Dataset B includes explicit CAPTCHA verification workflows during navigation, while Dataset A only implies error recovery scenarios",
        "Dataset B tasks require interaction with Amazon Fresh Grocery categories (e.g., fresh fruits), absent in Dataset A",
        "Dataset B contains time-sensitive Prime benefits like Grubhub $0 delivery fees, not present in Dataset A",
        "Dataset B emphasizes seasonal campaigns (Winter Sale, New Year themes) while Dataset A focuses on evergreen deals",
        "Dataset B tasks involve specific named products (e.g., 'e.l.f. Glow Reviver Lip Oil') rather than generic product searches common in Dataset A",
        "Dataset B includes gift-focused tasks (graduation cards, gift baskets) while Dataset A focuses on personal purchases",
        "Dataset B requires navigation through Amazon's premium services (Luxury Stores, Shopbop) not mentioned in Dataset A",
        "Dataset B tasks reference future-dated events (NFL Wild Card January 2025) while Dataset A uses current timeframes",
        "Dataset B contains explicit content format requirements (audiobooks, Prime Video) whereas Dataset A focuses on physical products"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Dataset B tasks focus on purchasing general product categories without specific attribute filters (e.g., 'Buy pet supplies for a puppy'), while Dataset A requires granular attribute-based filtering (e.g., thickness, size).",
        "Dataset B tasks include open-ended actions like 'Add 3 items to cart' without item specifications, whereas Dataset A cart additions target exact products (e.g., 'Blue iPhone 12 Pro 128GB').",
        "Dataset B tasks prioritize price identification as a primary goal (e.g., 'Find the price of bananas'), while Dataset A tasks emphasize price comparison or budget constraints within detailed criteria.",
        "Dataset B tasks involve broader exploratory goals (e.g., 'Find eco-friendly kitchen products'), whereas Dataset A tasks focus on verifying availability of exact variants (e.g., color/size).",
        "Dataset B tasks frequently reference seasonal/event-based promotions (e.g., 'Winter Sale under $50'), while Dataset A tasks center on permanent sale filters (e.g., 'under $30').",
        "Dataset B tasks include vague objectives like 'Buy gifts for a woman' without category constraints, while Dataset A tasks specify category-specific browsing (e.g., 'electronics').",
        "Dataset B tasks target generic product discovery (e.g., 'Find the most expensive items on sale'), whereas Dataset A tasks require sorting/filtering by metrics like 'Best Sellers' or 'newest arrivals'.",
        "Dataset B tasks lack explicit verification of policies (e.g., return/delivery), while Dataset A tasks explicitly require checking these details.",
        "Dataset B tasks mention brand names without technical specifications (e.g., 'Find Hasbro toy game'), while Dataset A combines brand identification with technical requirements (e.g., 'Samsung tablet with 10-10.9-inch screen').",
        "Dataset B tasks include ambiguous actions like 'Find out more info about Luxury Store Louis Vuitton', while Dataset A tasks demand concrete outcomes (e.g., 'Only answer the cheapest one')."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Dataset B tasks involve broader queries with fewer specific filters compared to Dataset A's granular attribute requirements (e.g., 'Find eco-friendly products' vs '5mm purple yoga mat').",
        "Dataset B includes exploratory instructions without defined outcomes (e.g., 'Browse fashion', 'Find birthday gift ideas'), while Dataset A requires concrete product matches.",
        "Dataset B tasks frequently omit numeric constraints for price ranges/sizes seen in Dataset A (e.g., 'under $30' vs 'Find the price of...').",
        "Dataset B contains meta-interactions like testing CAPTCHA functionality absent in Dataset A's product-focused tasks.",
        "Dataset A requires cross-referencing multiple product attributes simultaneously (color+size+rating+price), while Dataset B often isolates single criteria.",
        "Dataset B includes vague quantity instructions (e.g., 'Add 3 gourmet food items') without quality filters present in Dataset A's cart tasks.",
        "Dataset A emphasizes temporal specificity (e.g., '2024 publications', 'released within a month') absent in Dataset B's tasks.",
        "Dataset B contains more open-ended discovery tasks (e.g., 'Find unique gifts under $20') vs Dataset A's structured multi-filter searches.",
        "Dataset A requires policy verification steps (return/shipping details) as mandatory task components, while Dataset B mentions policies incidentally.",
        "Dataset B includes brand queries without model/specification requirements present in Dataset A (e.g., 'Wahl shampoo' vs 'MacBook-compatible USB-C hub')"
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Tasks in dataset B require basic arithmetic or simple equation solving (e.g., '3 times 5', '4x + 3 = 19'), whereas dataset A focuses on advanced computations (e.g., differential equations, series convergence).",
        "Dataset B includes tasks requesting general information retrieval (e.g., 'Find information on Spain's foreign debt'), while dataset A emphasizes structured scientific data extraction (e.g., material properties, unit conversions).",
        "Tasks in dataset B explicitly ask for conceptual explanations (e.g., 'explain step-by-step', 'research paradoxes'), whereas dataset A assumes prior conceptual knowledge and focuses on application.",
        "Dataset B contains exploratory tasks about Wolfram Alpha itself (e.g., 'Explore Wolfram Alpha capabilities'), which are absent in dataset A.",
        "Dataset B includes linguistic or definition-based queries (e.g., 'analyze the word \"hello\"', 'define triangulate'), while dataset A focuses strictly on technical domains.",
        "Tasks in dataset B often lack real-world parameter constraints (e.g., 'solve t² + 3t + 2 = 0'), whereas dataset A specifies contextual details (e.g., age, weight, time).",
        "Dataset B features basic statistical queries (e.g., 'mean of beta distribution'), while dataset A involves complex statistical distributions and probability metrics (e.g., convergence tests).",
        "Dataset B includes historical or social science inquiries (e.g., 'Industrial Revolution', 'COVID-19'), whereas dataset A prioritizes STEM-focused scenarios.",
        "Tasks in dataset B request atomic factual outputs (e.g., 'atomic mass of mercury'), while dataset A often requires multi-part answers (e.g., mass comparisons with auxiliary calculations).",
        "Dataset B contains tasks about generating functions or mathematical definitions (e.g., 'generating function of A000108'), whereas dataset A emphasizes applied mathematical problem-solving (e.g., simplifying polynomials)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Dataset A tasks require immediate numerical results for real-world scenarios (e.g., fat burned, Jupiter's mass).",
        "Dataset B tasks emphasize conceptual exploration (e.g., Johnson solids, Riemann Hypothesis).",
        "Dataset A tasks often include personalized parameters (e.g., age, weight, time).",
        "Dataset B tasks focus on retrieving structured factual data (e.g., mountain elevations, COVID-19 trends).",
        "Dataset A tasks prioritize step-by-step solutions for complex equations (e.g., ODEs, polynomial simplifications).",
        "Dataset B tasks involve broader domain exploration (e.g., etymology, climate models, historical events).",
        "Dataset A tasks use hypothetical or generalized inputs (e.g., \"assume 300g servings\").",
        "Dataset B tasks explicitly request downloadable outputs (e.g., plots, datasets).",
        "Dataset A tasks target real-time or current data (e.g., 2023 prices, currency conversions).",
        "Dataset B tasks include meta-navigation (e.g., exploring Wolfram Alpha's features or language capabilities)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Dataset B tasks focus on data retrieval (e.g., temperature anomalies, moon phases) while Dataset A emphasizes applied computational problem-solving.",
        "Dataset B includes tasks requiring conceptual explanations (e.g., paradoxes, thermodynamics), whereas Dataset A prioritizes numerical or quantitative results.",
        "Dataset B contains basic mathematical queries (e.g., quadratic equations, factorial calculations) while Dataset A involves advanced calculus (e.g., ODEs, series convergence).",
        "Dataset B tasks frequently involve statistical distributions (e.g., beta, normal) while Dataset A emphasizes unit conversions and material properties.",
        "Dataset B includes exploratory or research-oriented tasks (e.g., etymology, paradox exploration) while Dataset A focuses on real-world applications (e.g., finance, health metrics).",
        "Dataset B tasks explicitly request step-by-step solutions for elementary math, while Dataset A integrates multi-step decomposition in complex domains (e.g., physics, engineering).",
        "Dataset B includes astronomical/geographical data queries (e.g., solar eclipses, sunrise times) absent in Dataset A, which centers on engineering or thermodynamic scenarios.",
        "Dataset A tasks involve comparative analyses (e.g., food calorie comparisons) while Dataset B focuses on singular data points or definitions.",
        "Dataset B incorporates linguistic or etymological queries (e.g., word origins) not present in Dataset A.",
        "Dataset A tasks require domain-specific scientific knowledge (e.g., material conductivity, spring pendulum physics) whereas Dataset B includes general science overviews (e.g., carbon properties)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Dataset B tasks focus on exploratory information retrieval (e.g., definitions, etymologies, historical facts) rather than applied computational outputs.",
        "Dataset B includes explicit requests for platform metadata (e.g., pricing, feature descriptions, resource navigation) absent in Dataset A.",
        "Dataset B emphasizes educational tool discovery (e.g., math problem generators, homework help interfaces) as primary objectives.",
        "Dataset B tasks frequently involve pure mathematical concept exploration (e.g., equation roots, polynomial structures) without real-world parameterization.",
        "Dataset B requires explicit documentation lookups (e.g., function definitions, theorem explanations) rather than implicit knowledge application.",
        "Dataset B contains financial planning workflows (e.g., mortgage research, investment analysis) instead of isolated financial conversions/comparisons.",
        "Dataset B tasks demonstrate stronger emphasis on data persistence actions (e.g., 'save it', 'download properties') alongside computations.",
        "Dataset B includes procedural learning objectives (e.g., 'how is it calculated', 'review solution steps') as explicit task requirements.",
        "Dataset B features platform capability audits (e.g., 'determine if allowed to disseminate') not present in Dataset A's assumption of tool access.",
        "Dataset B tasks prioritize theoretical physics/mechanics concepts over Dataset A's applied engineering/physics problem-solving"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Dataset B tasks focus on retrieving factual properties (e.g., element isotopes, boiling points) rather than performing multi-variable real-world computations like Dataset A.",
        "Dataset B tasks involve simpler mathematical operations (e.g., solving linear equations) compared to Dataset A's advanced calculus (e.g., differential equations, series convergence).",
        "Dataset B tasks prioritize direct data queries (e.g., GDP, chemical properties) over Dataset A's scenario-based calculations (e.g., calorie burn, financial projections).",
        "Dataset B tasks include exploratory or feature-testing goals (e.g., 'Explore Wolfram Alpha features') absent in Dataset A's focused problem-solving tasks.",
        "Dataset B tasks lack explicit emphasis on comparative analyses (e.g., food item comparisons) common in Dataset A.",
        "Dataset B tasks require basic arithmetic or algebraic solutions (e.g., prime factorization) instead of Dataset A's statistical or probabilistic computations (e.g., averages, distributions).",
        "Dataset B tasks frequently reference output formatting (e.g., TeX, image downloads) for results, while Dataset A emphasizes numerical precision (e.g., scientific notation).",
        "Dataset B tasks focus on single-entity information retrieval (e.g., element properties) rather than Dataset A's multi-parameter personalized calculations (e.g., age, weight, time).",
        "Dataset B tasks include navigation-oriented queries (e.g., 'Find examples of polynomial-related information') not present in Dataset A.",
        "Dataset B tasks lack explicit requests for step-by-step visualizations (e.g., plots) common in Dataset A's problem-solving workflows."
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Dataset A tasks require filtering by exact review counts (e.g. >1000 reviews) while B uses qualitative metrics",
        "Dataset A emphasizes numeric rating thresholds (4+ stars) while B focuses on general popularity",
        "Dataset A specifies precise prep/cook time constraints (<30min) while B uses vague time indicators",
        "Dataset A tasks involve creating shopping lists while B focuses on recipe discovery/saving",
        "Dataset A requires ingredient-specific searches (zucchini, quinoa) while B uses categorical ingredients",
        "Dataset B includes leftover ingredient utilization tasks absent in A",
        "Dataset B contains kid-friendly recipe requirements not present in A",
        "Dataset A emphasizes nutritional constraints (calorie counts) while B focuses on dietary labels",
        "Dataset B includes specific cuisine requests (Thai desserts) while A uses broader categories",
        "Dataset A tasks involve detailed recipe analysis while B focuses on general meal planning"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Tasks in Dataset B require interaction with user-generated content like leaving reviews, not just reading them.",
        "Dataset B includes tasks that involve finding non-recipe items (e.g., kitchen appliances, gingerbread house kits).",
        "Dataset B tasks focus on meal planning for events/holidays rather than individual recipe retrieval.",
        "Dataset B tasks explicitly require ingredient substitutions (e.g., finding alternatives for evaporated milk).",
        "Tasks in Dataset B involve comparing nutritional information across recipes, not just listing it.",
        "Dataset B tasks are broader (e.g., 'Christmas recipes') without strict rating/review count thresholds.",
        "Dataset B includes creating multi-dish menus (e.g., holiday party planning) rather than single recipes.",
        "Dataset B tasks emphasize meal prep for weekly planning, not immediate cooking needs.",
        "Dataset B tasks integrate product recommendations (e.g., kitchen tools) alongside recipes.",
        "Dataset B includes event-specific setup tasks (e.g., hot chocolate bars) beyond recipe execution."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Tasks in B require saving multiple recipes in a single action (e.g., 'Save at least three recipes'), while A focuses on saving/bookmarking individual recipes.",
        "B includes tasks centered on modifying existing recipes (e.g., substitutions for ingredients), whereas A emphasizes strict adherence to recipe constraints without modifications.",
        "Tasks in B involve explicit user interactions like leaving reviews/ratings, while A focuses on extracting or parsing existing reviews.",
        "B contains holiday/event-specific recipe searches (e.g., Christmas desserts, New Year appetizers), while A targets general categories like 'Dinners' or 'Desserts'.",
        "B includes tasks for kid-friendly or family-oriented recipes (e.g., 'kid-friendly snack recipe'), which are not explicitly prioritized in A.",
        "Tasks in B require comparing nutritional data across recipes (e.g., 'Compare nutritional information'), while A focuses on parsing metadata within a single recipe.",
        "B features tasks related to meal prep planning (e.g., 'Explore recipe ideas for meal prep'), whereas A emphasizes immediate recipe execution.",
        "Tasks in B explore cuisine-specific recipes (e.g., 'Indian or Asian theme'), while A focuses on broader dietary labels like 'vegetarian' or 'gluten-free'.",
        "B includes searches for ingredient substitutes (e.g., 'substitute for evaporated milk'), which are absent in A's task requirements.",
        "Tasks in B prioritize seasonal/trending collections (e.g., 'holiday baking tips'), while A emphasizes static curated collections like 'Trending Now'."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Dataset B tasks frequently involve leaving user reviews or interacting with recipe feedback mechanisms, while Dataset A focuses more on retrieving existing reviews.",
        "Dataset B includes navigation tasks centered around specific dietary restrictions (e.g., keto, gluten-free, low-carb) not explicitly emphasized in Dataset A.",
        "Dataset B requires users to locate nutritional facts/calorie counts as part of task requirements, whereas Dataset A focuses on general nutritional information availability.",
        "Dataset B contains tasks explicitly mentioning meal prep planning and ingredient substitution needs, while Dataset A focuses on immediate recipe execution.",
        "Dataset B includes recipe modification tasks (e.g., 'use leftovers') as a core requirement, whereas Dataset A focuses on finding complete recipes.",
        "Dataset B emphasizes holiday/event-specific recipe discovery beyond general seasonal categories (e.g., Halloween snacks, Thanksgiving appetizers), while Dataset A focuses on broader holiday categories.",
        "Dataset B tasks require interaction with appliance-specific filters (e.g., air fryer, slow cooker) that aren't prominent in Dataset A's navigation requirements.",
        "Dataset B includes budget-conscious task parameters (e.g., 'cheap recipes', 'Amazon deals') not explicitly present in Dataset A's samples.",
        "Dataset B tasks frequently involve multi-recipe coordination (e.g., '7 dinner recipes for a week'), while Dataset A focuses on single-recipe retrieval.",
        "Dataset B emphasizes kid-friendly meal requirements and age-specific dietary considerations in task parameters, unlike Dataset A's general audience focus"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Tasks in B require utilizing leftover ingredients in recipes, while A does not",
        "B includes tasks focused on budget-friendly meal options, which A does not emphasize",
        "B tasks involve requesting or providing community-driven recipe substitutions/modifications, unlike A",
        "B requires interaction with niche dietary preferences (e.g. keto) beyond A's standard categories",
        "B tasks specifically target international cuisine recipes (e.g. Korean BBQ, Thai), while A focuses on broader categories",
        "B includes occasion-specific recipes for non-holiday events (e.g. New Year's Eve parties), whereas A focuses on major holidays",
        "B tasks emphasize meal prep planning and storage strategies not present in A",
        "B requires creating/applying decorative presentation elements (e.g. garnishes), which A does not address",
        "B tasks involve detailed review authoring with personal adaptation suggestions, while A focuses on review parsing",
        "B includes requests for beginner-friendly cooking instructions, which A does not explicitly require"
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Dataset B tasks require identifying synonyms and antonyms of queried words, while Dataset A does not explicitly involve antonym/synonym exploration.",
        "Tasks in Dataset B involve exploring word etymologies, which are not mentioned in Dataset A's tasks.",
        "Dataset B includes tasks that demand counting syllables in words, a feature not present in Dataset A's requirements.",
        "Dataset B tasks require comparing word relationships (e.g., 'relation with \"meeting\"'), while Dataset A focuses on isolated word analysis.",
        "Dataset B explicitly involves translating full phrases (e.g., 'How are you?'), whereas Dataset A only references single-word translations.",
        "Dataset B tasks include providing feedback on example sentences, an interaction not required in Dataset A.",
        "Dataset B requires exploration of business-specific vocabulary (e.g., 'business professional'), while Dataset A focuses on general vocabulary.",
        "Tasks in Dataset B involve verifying word usage in specific phrases (e.g., 'bit by bit'), whereas Dataset A focuses on standalone example sentences.",
        "Dataset B includes tasks analyzing word popularity through 'Popular searches' lists, absent in Dataset A's requirements.",
        "Dataset B tasks reference explicit privacy choices ('Do Not Sell My Personal Information'), while Dataset A only involves basic cookie consent management."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Dataset B tasks require identifying functional language usage (e.g., greetings/farewells) rather than isolated word lookups",
        "Dataset B includes tasks involving collocation identification (e.g., 'affordable accommodation') not explicitly required in A",
        "Dataset B contains tasks about grammatical concept relationships (e.g., market research vs advertising) absent in A",
        "Dataset B requires exploring grammatical forms (e.g., basic verb forms) rather than specific grammar rules",
        "Dataset B tasks demand phrase-level analysis (e.g., 'in a nutshell') where A focuses on single-word definitions",
        "Dataset B includes broader vocabulary exploration tasks (e.g., 'words related to psychology') unlike A's specific word queries",
        "Dataset B contains explicit translation direction tasks (e.g., English\u2192Spanish days) requiring bidirectional understanding",
        "Dataset B requires handling conceptual relationships between terms (e.g., terminology\u2192linguistics) not present in A",
        "Dataset B tasks involve error recovery during navigation (e.g., 'handle errors') absent in A's linear tasks",
        "Dataset B includes antonym identification for phrases (e.g., luck-related terms) where A focuses on word synonyms"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Dataset B tasks require interaction with Spanish translations (English\u2013Spanish/Spanish\u2013English) while Dataset A focuses on Chinese, French, and German translations.",
        "Dataset B includes tasks involving 'Popular searches' lists (e.g., 'work', 'schedule', 'statuette') for common word exploration, absent in Dataset A.",
        "Dataset B tasks reference future dates in blog content (e.g., December 2024), while Dataset A uses future dates (March/April 2025).",
        "Dataset B tasks require engagement with privacy consent banners (e.g., 'Do Not Sell My Personal Information'), not present in Dataset A.",
        "Dataset B tasks involve exploring phrases like 'bit by bit' in blog sections, whereas Dataset A focuses on topics like 'low prices' in blog content.",
        "Dataset B includes tasks about psychological terms (e.g., 'glossophobia') and teaching methods, while Dataset A tasks focus on general vocabulary and grammar rules.",
        "Dataset B's Word of the Day example ('box-office') differs semantically from Dataset A's example ('one-size-fits-all'), reflecting distinct lexical priorities.",
        "Dataset B tasks emphasize comparing translations between Traditional and Simplified Chinese (e.g., 'hello'), while Dataset A uses Chinese translations without explicit comparison directives.",
        "Dataset B includes tasks requiring exploration of website features (e.g., 'Learn how to use the dictionary translation'), whereas Dataset A assumes prior familiarity with navigation.",
        "Dataset B tasks reference preparing teaching materials (e.g., 'teaching methods to prepare teaching materials'), absent in Dataset A's task scope."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Dataset B tasks frequently require handling multiple words/phrases in single queries (e.g. 'apple and Friday') while A focuses on single-word queries",
        "Dataset B includes abstract conceptual tasks (e.g. 'find a word capturing the feeling of dawn') requiring interpretation, unlike A's concrete definition lookups",
        "Dataset B contains technical implementation tasks (e.g. 'find the code for solve') absent in A's purely linguistic focus",
        "Dataset B features interactive quiz completion tasks (e.g. 'animal-related quiz') not present in A's game-focused activities",
        "Dataset B emphasizes antonym exploration and emotional connotation analysis more than A's synonym-focused tasks",
        "Dataset B tasks require broader grammar concept exploration (e.g. 'different aspects of verbs') vs A's specific grammar rule lookups",
        "Dataset B includes cross-website research instructions (e.g. 'using search engine') unlike A's strictly site-contained tasks",
        "Dataset B tasks more frequently involve phrase-level analysis (e.g. 'in a nutshell') compared to A's single-word focus",
        "Dataset B requires explicit content sharing actions (e.g. 'share on Twitter') while A implies passive social media access",
        "Dataset B contains meta-navigation tasks (e.g. 'explore website features') absent in A's direct information retrieval focus"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Dataset B includes tasks requiring exploration of vocabulary through inferred context and linked terms (e.g., \"trajectory... navigating to vocabulary-related web pages\"), whereas Dataset A focuses on direct lookups.",
        "Dataset B tasks involve translating phrases (e.g., \"break a leg\" to Portuguese) beyond simple word translations, while Dataset A focuses on direct word/phrase translations between languages.",
        "Dataset B tasks explicitly require understanding parts of speech distinctions (e.g., \"adverb phrases vs adjective phrases\"), which are not specified in Dataset A.",
        "Dataset B includes tasks about sharing definitions (e.g., \"Share the definition of 'jukebox'\"), a feature absent in Dataset A tasks.",
        "Dataset B tasks involve open-ended exploration of dictionary features (e.g., \"Explore the Cambridge Dictionary's features\"), while Dataset A tasks are strictly goal-oriented.",
        "Dataset B tasks reference annual events like the \"word of the year\" (e.g., \"2024 word of the year\"), which Dataset A does not mention.",
        "Dataset B tasks require identifying collocations (e.g., \"collocations related to accommodation\"), whereas Dataset A focuses on synonyms/antonyms.",
        "Dataset B includes vague, multi-step navigation (e.g., \"inferring terms based on actions\"), while Dataset A tasks are granular and linear.",
        "Dataset B tasks involve Spanish and Dutch translations (e.g., \"Translate 'hello' to Dutch\"), whereas Dataset A focuses on Chinese and French.",
        "Dataset B tasks require resolving ambiguities (e.g., \"easily confused word pairs\"), unlike Dataset A\u2019s explicit grammar rule lookups."
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Dataset B tasks require device customization processes (e.g., creating Apple Watch combinations)",
        "Dataset B includes navigation through enterprise/business account management interfaces",
        "Dataset B tasks involve locating healthcare-specific product applications and documentation",
        "Dataset B requires interaction with environmental impact reports and carbon neutrality initiatives",
        "Dataset B contains tasks needing navigation through app-specific update histories and award information",
        "Dataset B includes troubleshooting scenarios for device functionality issues (e.g., battery life)",
        "Dataset B tasks require comparison of accessory combinations with specific device models",
        "Dataset B involves navigation through family sharing setup and management interfaces",
        "Dataset B contains tasks requiring analysis of device repair/maintenance documentation",
        "Dataset B includes explicit privacy data usage exploration beyond basic security features"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Dataset B tasks focus on environmental impact and sustainability information for products",
        "Dataset B requires accessing device management features like battery optimization and performance tips",
        "Dataset B includes tasks related to family sharing setup and group management",
        "Dataset B contains navigation requirements for warranty status checks and repair options",
        "Dataset B involves comparing products through enterprise/business purchasing programs",
        "Dataset B tasks require accessing data privacy and security policy information",
        "Dataset B emphasizes post-purchase support and service management features",
        "Dataset B includes health/fitness metric analysis through device integrations",
        "Dataset B tasks require navigation through enterprise/business solution sections",
        "Dataset B contains product configuration tasks with bundled service subscriptions"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Dataset A tasks require precise calculation of price differences between base and maximum configurations (e.g., storage upgrades), while Dataset B focuses on general price retrieval without granular configuration comparisons",
        "Dataset B includes explicit tasks related to business use cases (e.g., 'examples of businesses using Apple technology'), while Dataset A focuses exclusively on consumer-focused navigation",
        "Dataset A requires identification of specific technical specifications (e.g., processor types, wireless pairing features), while Dataset B emphasizes broader product comparisons and feature overviews",
        "Dataset B contains tasks involving educational institution purchases and academic discounts (e.g., 'price for University of Alabama'), while Dataset A focuses on individual consumer education pricing",
        "Dataset A tasks demand exact accessory compatibility verification (e.g., '5 Built-in Apps supported'), while Dataset B focuses on general accessory purchasing without technical validation",
        "Dataset B includes healthcare-specific product inquiries and achievements tracking, which are absent from Dataset A's consumer-focused tasks",
        "Dataset A requires zip code-specific in-store availability checks, while Dataset B tasks only involve general store location finding",
        "Dataset B contains financial reporting tasks (e.g., 'quarterly earnings report'), a category completely absent from Dataset A",
        "Dataset A focuses on environmental feature cross-referencing with product specs, while Dataset B emphasizes standalone environmental report retrieval",
        "Dataset B includes device repair cost inquiries and parental control setup tasks, expanding beyond Dataset A's new purchase-focused support requests"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Dataset B tasks require accessing enterprise/business-specific sections (business plans, enterprise device management) not present in Dataset A",
        "Dataset B contains queries about Family Sharing configuration and 'Ask to Buy' features absent in Dataset A",
        "Dataset B includes financial transaction tasks requiring navigation to investor relations/financial reports (quarterly results) not seen in Dataset A",
        "Dataset B tasks demand interaction with environmental impact reports/sustainability documentation unavailable in Dataset A",
        "Dataset B requires precise device configuration specifications (e.g. 6.3-inch display, Jet Black Aluminum) beyond Dataset A's generic customization",
        "Dataset B contains explicit troubleshooting scenarios (keyboard liquid damage, battery repair costs) not present in Dataset A",
        "Dataset B tasks involve purchasing specific accessories (cases, HomePod) rather than just comparing included accessories as in Dataset A",
        "Dataset B includes business-oriented queries (Apple Business Essentials pricing, enterprise purchasing) absent from Dataset A's consumer focus",
        "Dataset B requires navigation through privacy/data usage documentation (user tracking policies) not needed in Dataset A tasks",
        "Dataset B tasks involve retail operations navigation (store hours lookup, bulk purchasing) not present in Dataset A"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Tasks in B require direct purchase configuration actions (selecting models/storage/carriers)",
        "B includes warranty status checks and AppleCare-related queries as core tasks",
        "B tasks focus on business/commercial purchasing options and enterprise solutions",
        "B requires accessing specific repair/troubleshooting documentation for hardware issues",
        "B contains app-specific research tasks (version history, reviews of third-party apps)",
        "B emphasizes accessory compatibility checks (case colors/device-specific accessories)",
        "B includes device integration queries (iPhone-Watch connectivity/feature synergy)",
        "B tasks require navigating privacy/security settings and data handling explanations",
        "B contains battery optimization/maintenance tasks across multiple device types",
        "B requires finding technical specifications for specific repair procedures"
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Tasks in dataset B more frequently involve exploratory research without a predefined specific answer (e.g., 'research climate change causes')",
        "Dataset B contains more explicit requests for instructional/educational content (e.g., 'find beginner tutorials') compared to dataset A's focus on factual lookup",
        "Dataset B includes tasks requiring interaction with specialized professional resources (e.g., job listings, business tools) not present in A",
        "Tasks in B more commonly involve personal scenario planning (e.g., event venues, parenting advice) rather than pure factual retrieval",
        "Dataset B shows increased emphasis on comparative shopping tasks (e.g., product price comparisons) not seen in A",
        "B contains tasks requiring navigation through health/medical information resources (e.g., symptom checking) absent from A",
        "Dataset B includes explicit requests to interact with account/services management features (e.g., scheduling meetings with specialists)",
        "Tasks in B more frequently require understanding of professional/commercial terminology (e.g., 'financials', 'tech specs') compared to A's general factual terms",
        "Dataset B contains tasks involving content contribution/editing (e.g., Wikipedia edits) not present in A's observation-only tasks",
        "B shows increased emphasis on location-based personalization (e.g., 'near me' requirements) compared to A's universal queries"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Dataset B tasks involve recipe discovery and meal planning (e.g., smoothie recipes, ingredient lists)",
        "Dataset B includes explicit requests for future-oriented information (2025 movie trailers, SEO trends for 2025)",
        "Dataset B tasks focus on educational/career development (language learning, SEO strategies, job applications)",
        "Dataset B contains event planning objectives (venue research, ticket purchases, amenities comparison)",
        "Dataset B emphasizes practical daily life applications (parenting advice, woodworking projects, health management)",
        "Dataset B requires interaction with self-improvement/learning platforms (Duolingo courses)",
        "Dataset B includes tasks about understanding platform mechanics (Google's security features, autocomplete functionality)",
        "Dataset B features citation/style-specific academic requests (AMA format references)",
        "Dataset B tasks focus on corporate/technology analysis (Google AI applications, stock comparisons)",
        "Dataset B contains explicit environmental/social responsibility queries (climate action, sustainability initiatives)"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Dataset A tasks prioritize retrieval of specific numerical/chronological facts (elevations, dates, statistics), while B focuses on conceptual understanding (AI innovations, climate effects, academic programs)",
        "Dataset A contains more real-time verification tasks (login success confirmation, live scores), whereas B emphasizes exploratory information gathering (research trends, program details)",
        "Transactional tasks in A involve basic e-commerce actions (purchases, reservations), while B includes professional interactions (job applications, corporate event planning)",
        "Dataset A requires extraction of discrete data points from structured layouts (sports stats, comment metadata), while B demands synthesis of information from unstructured prose (research papers, articles)",
        "Domain focus in A centers on entertainment/sports/personal metrics, whereas B emphasizes academic/technical/professional development content",
        "Dataset B includes tasks requiring comparative analysis (stock comparisons, trend evaluations) absent in A's direct fact retrieval",
        "B contains contributory tasks (Wikipedia editing, recipe database additions) not present in A's consumption-oriented workflows",
        "Dataset A's health-related tasks focus on environmental metrics (air quality), while B addresses medical information (vaccine details, infection prevention)",
        "B features skill/knowledge acquisition tasks (tutorials, recipe finding) contrasting with A's transient information needs",
        "Dataset B includes explicit career development components (job searches, professional research) absent in A's personal/non-vocational focus"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "Tasks in B involve booking reservations or purchasing products (e.g., hotels, event venues, benches), whereas A focuses on non-transactional data retrieval.",
        "B includes tasks requiring multi-criteria comparisons (e.g., hotel ratings, travel destinations), while A emphasizes single-value extraction (e.g., dates, statistics).",
        "B contains health/medical inquiries (e.g., flu symptoms, diabetes treatments) absent in A.",
        "B involves open-ended research for inspiration/ideas (e.g., event planning, woodworking projects), whereas A prioritizes definitive answers.",
        "B includes career/job-related tasks (e.g., software engineering job searches), which A lacks.",
        "B requires product configuration (e.g., customizing Pixel phones), while A focuses on static specifications.",
        "B features lifestyle/practical tasks (e.g., recipes, parenting tips) not present in A.",
        "B includes translation/localization tasks (e.g., French-to-English translation), absent in A.",
        "B tasks often involve temporal constraints (e.g., 'jobs posted in last 3 days'), while A emphasizes real-time data without date-bound filtering.",
        "B targets venue/activity availability checks (e.g., museum tickets, recreation centers), whereas A focuses on universally accessible facts."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Dataset A tasks predominantly require extracting exact numerical/metric values (e.g., elevation, counts, dates), while Dataset B focuses on retrieving general informational content (e.g., definitions, advice, event ideas).",
        "Dataset B includes tasks involving purchasing/transactional interactions (e.g., ticket purchases, hotel bookings), absent in Dataset A.",
        "Dataset A tasks frequently involve verifying account credentials or login success, whereas Dataset B lacks such authentication-focused tasks.",
        "Dataset B tasks emphasize comparing product features or prices (e.g., Apple Watch models, stocks), while Dataset A comparisons focus on rankings or aggregated data (e.g., prices, rankings).",
        "Dataset A tasks prioritize real-time or dynamic data retrieval (e.g., scores, pollution levels), whereas Dataset B includes static or procedural information (e.g., program details, health guidelines).",
        "Dataset B tasks often require navigation to specific institutional websites (e.g., universities, CDC), while Dataset A relies more on search engine results for information.",
        "Dataset A tasks demand parsing structured multimedia content (e.g., YouTube comments, GitHub commits), whereas Dataset B focuses on parsing articles, reviews, or guides.",
        "Dataset B includes tasks related to managing settings or configurations (e.g., Chrome translation settings, search history), absent in Dataset A.",
        "Dataset B tasks involve retrieving health/medical guidance (e.g., pink eye causes, blood pressure foods), which are not present in Dataset A.",
        "Dataset A tasks target discrete entities (e.g., people, events, movies), while Dataset B emphasizes conceptual understanding (e.g., AI principles, machine learning definitions)."
      ]
    }
  },
  "diffs_real_from_synth": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Dataset B tasks explicitly require analyzing user review content (e.g., reading specific star-level reviews) while A focuses on review availability/summaries",
        "Dataset B includes counting/quantification requirements (e.g., 'how many results') not present in A's comparison-focused tasks",
        "Dataset B contains specific parking type requests (motorcycle, EV charging, daytime-only) as distinct search criteria",
        "Dataset B requires detailed route step documentation (e.g., 'provide detailed route information') rather than general navigation",
        "Dataset B tasks demand URL/link generation for sharing locations (e.g., 'generated sharing link')",
        "Dataset B includes chain business location queries (e.g., 'all Uniqlo locations') as explicit search targets",
        "Dataset B requires proximity analysis to transportation hubs (e.g., airports) beyond A's general geographic markers",
        "Dataset B tasks involve multi-instruction sequences (e.g., 'first search X then do Y') as explicit workflow requirements",
        "Dataset B includes structural analysis of business information (e.g., 'which level has...') beyond attribute extraction",
        "Dataset B specifies vehicle-specific requirements (bicycle/motorcycle parking) as distinct from general accessibility"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Dataset B tasks frequently specify exact business chains (e.g., Target, Apple Stores) while Dataset A uses general business categories.",
        "Dataset B includes tasks requiring identification of specific parking types (e.g., motorcycle, EV charging) not present in Dataset A.",
        "Dataset B tasks utilize exclusionary filters (e.g., 'not open 24 hours') whereas Dataset A focuses on inclusionary criteria like 'open now'.",
        "Dataset B tasks demand sharing map links or generated URLs, a feature absent in Dataset A tasks.",
        "Dataset B requires summarizing user review sentiments while Dataset A focuses on extracting specific review content.",
        "Dataset B tasks explicitly request counting exact numbers of search results, whereas Dataset A's quantitative tasks involve listing a specified number.",
        "Dataset B includes transportation-specific queries (e.g., bus stops, bicycle parking) while Dataset A refers to general transit options.",
        "Dataset B tasks combine location searches with subsequent logistical actions (e.g., find hotel then supermarket), unlike Dataset A's sequential multi-steps.",
        "Dataset B tasks require detailed route step instructions, while Dataset A's direction requests are more general.",
        "Dataset B tasks focus on specific amenity combinations (e.g., EV charging near museums) whereas Dataset A's criteria are broader."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Dataset B tasks require explicit analysis of user review content (e.g., 'check what a one-star review says') while Dataset A focuses only on rating thresholds",
        "Dataset B contains tasks requiring quantification of results (e.g., 'how many results are shown') that don't appear in Dataset A",
        "Dataset B includes specific operational hour exceptions (e.g., 'not open 24 hours') as filters, while Dataset A uses basic 'open now' constraints",
        "Dataset B tasks require map sharing functionality (e.g., 'find the way to share the map') absent in Dataset A",
        "Dataset A contains reservation-making requirements that never appear in Dataset B tasks",
        "Dataset B specifies exact numerical rating thresholds (e.g., '4.6 or higher') while Dataset A uses relative descriptors like 'highly-rated'",
        "Dataset B tasks require analysis of location-specific infrastructure layers (e.g., 'which level has...') not present in Dataset A",
        "Dataset B includes explicit quantity requirements (e.g., 'list three of them', 'find 5 beauty salons') absent in Dataset A",
        "Dataset B tasks demand landmark-specific parking types (e.g., 'motorcycle parking', 'EV charging') while Dataset A uses general accessibility filters",
        "Dataset B contains tasks requiring summarization of crowd-sourced information (e.g., 'summarize what people are saying') not seen in Dataset A"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Tasks in dataset B require analyzing specific aspects of user reviews (e.g., reading one-star review content) rather than just overall ratings",
        "Dataset B tasks explicitly demand counting/number quantification of results (e.g., 'how many results are shown')",
        "Dataset B includes tasks requiring summarization/paraphrasing of user review content rather than simple retrieval",
        "Dataset B contains tasks with exclusion criteria filters (e.g., 'not open 24 hours') not just inclusion criteria",
        "Dataset B requires detailed breakdowns of route components/steps rather than just generating directions",
        "Dataset B tasks specify proximity measurements using exact distance units (e.g., 'within 2 miles') more frequently",
        "Dataset B emphasizes parking-specific navigation (different types: motorcycle, bicycle, daytime-only) as distinct category",
        "Dataset B includes map sharing/link generation tasks as explicit action requirements",
        "Dataset B tasks require comparing sub-components of locations (e.g., 'which level has least proportion in reviews')",
        "Dataset B contains multi-location coordination tasks requiring time calculations (e.g., walking time between points)"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Dataset A tasks frequently involve reservation/bookings with specific dates/times (e.g., hotel bookings, restaurant reservations)",
        "Dataset B tasks more commonly request analysis of review contents (e.g., 'check what one-star reviews say', 'summarize what people are saying')",
        "Dataset B contains explicit requirements for quantitative reporting (e.g., 'how many results', 'list three of them') not seen in A",
        "Dataset B includes specific parking type queries (motorcycle, EV charging, daytime-only) not present in A",
        "Dataset A tasks emphasize multi-stop route planning with intermediate points, while B focuses on point-to-point navigation",
        "Dataset B contains explicit map interaction requirements (e.g., 'share the map', 'generated sharing link') absent in A",
        "Dataset B tasks more frequently use exclusionary filters ('not open 24 hours') compared to A's additive filters",
        "Dataset A includes accessibility evaluation as part of route planning, while B focuses on accessibility as static location attribute",
        "Dataset B shows stronger emphasis on chain/store location queries (e.g., 'Uniqlo locations', 'Target stores')",
        "Dataset A contains more temporal planning elements (future dates, itinerary creation) compared to B's immediate needs"
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks in B require real-time or recent data validation (e.g., repositories updated within the last 2 days, latest commits).",
        "Tasks in B involve precise quantitative filters (e.g., stars \u2265500, contributors \u22653), while A uses broader criteria.",
        "Tasks in B demand extraction of specific commit details (e.g., changed files, additions/deletions) rather than general history.",
        "Tasks in B include verifying account existence via specific email inputs, absent in A.",
        "Tasks in B require listing exact feature counts (e.g., '3 features in Copilot'), whereas A focuses on qualitative comparisons.",
        "Tasks in B explicitly reference trending repositories with rankings (e.g., 'ranked first this month'), unlike A.",
        "Tasks in B mandate identification of project purposes (e.g., 'main purpose of C++ project'), while A focuses on feature discovery.",
        "Tasks in B involve checking release metadata (e.g., stable version publish dates), absent in A's vulnerability-focused tasks.",
        "Tasks in B require listing customer stories visible on the page (e.g., '2 stories'), while A focuses on locating sections.",
        "Tasks in B prioritize contributor metrics (e.g., 'top three contributors'), whereas A emphasizes contribution processes."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Dataset B tasks require verifying specific user account status (e.g., email existence check) while A focuses on general feature exploration",
        "Dataset B tasks demand precise identification of repository contribution metrics (e.g., top contributors, commit statistics) where A focuses on general contributor information",
        "Dataset B requires strict time-bound filtering (e.g., 'last 2 days', 'past 30 days') while A uses broader recency parameters",
        "Dataset B tasks involve direct repository file inspection (e.g., changed files in commits) while A focuses on metadata overview",
        "Dataset B requires quantitative star-based filtering (e.g., 'at least 500 stars') while A uses relative popularity measures",
        "Dataset B tasks involve ranking/prioritization (e.g., 'most popular', 'top three') while A focuses on existence verification",
        "Dataset B requires brief project description generation from repository data while A focuses on policy/documentation retrieval",
        "Dataset B tasks demand specific version/release information (e.g., stable release dates) while A focuses on general product versions",
        "Dataset B includes explicit comparisons between numerical plan limits (e.g., private repo counts) while A compares plan features qualitatively",
        "Dataset B requires identification of trending/ranked content (e.g., 'currently ranked first') while A focuses on general customer stories"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Tasks in dataset B require real-time filtering of repositories by specific timeframes (e.g., 'last 2 days', 'past 30 days') rather than general update date ranges.",
        "Dataset B tasks emphasize immediate identification of numerical thresholds (e.g., 'at least 500 stars', 'over 100 stars') in search criteria.",
        "Dataset B includes explicit requirements to extract quantitative metadata (e.g., 'total additions and deletions', 'number of courses') from identified resources.",
        "Tasks in dataset B focus on transient platform sections (Trending, Customer Stories) rather than permanent product documentation.",
        "Dataset B requires direct extraction of contributor rankings (e.g., 'top three contributors', 'ranked first this month') from repository data.",
        "Tasks in dataset B demand identification of specific file-level changes in commit histories rather than general commit history analysis.",
        "Dataset B emphasizes identification of trending/realtime leaderboard data (e.g., 'currently ranked first this month') in platform sections.",
        "Tasks in dataset B require verification of exact plan limitations (e.g., 'maximum number of private repositories') during comparisons.",
        "Dataset B includes concrete validation tasks (e.g., 'see if email exists') within authentication workflows rather than general sign-up processes.",
        "Tasks in dataset B specify exact technical domain filters (e.g., 'tagged with web scraping', 'quantum computing') rather than general language-based filtering."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Dataset B tasks require immediate interaction with sign-up forms to verify email existence or account status.",
        "Dataset B tasks involve retrieving specific quantitative repository metadata (e.g., exact star counts, contributor rankings).",
        "Dataset B requires filtering repositories using precise time constraints (e.g., 'updated within last 2 days').",
        "Dataset B tasks demand identification of exact file changes in commit histories or release details.",
        "Dataset B includes explicit requests for numerical comparisons between plans (e.g., maximum private repositories in Free vs Pro).",
        "Dataset B tasks require extraction of ranked lists (e.g., 'top three contributors', 'most popular repo by stars').",
        "Dataset B focuses on real-time validation of repository attributes (e.g., freshness of updates, trending status).",
        "Dataset B tasks specify language + technology stack combinations in search criteria (e.g., 'Python + decision trees').",
        "Dataset B requires identification of exact feature lists from product pages (e.g., '3 Copilot features').",
        "Dataset B tasks involve concrete thresholds for popularity metrics (e.g., 'at least 500 stars')."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Tasks in B require verifying existing account status (e.g., email availability checks) during sign-up flows",
        "B tasks involve extracting exact commit metadata (e.g., changed files, additions/deletions) from specific repositories",
        "B requires handling precise numerical filters (e.g., '500 stars', 'last 2 days') in repository searches",
        "B tasks demand reporting ranked/quantified results (e.g., 'top three contributors', 'most popular repo')",
        "B focuses on identifying official repositories for known projects (e.g., TensorFlow, ALBERT)",
        "B includes explicit requirements to analyze GitHub Trending rankings and developer leaderboards",
        "Tasks in B require extracting enumerated feature lists from product pages (e.g., 'list 3 features')",
        "B tasks involve comparing exact numerical plan limits (e.g., 'maximum private repositories') between tiers",
        "B requires locating and quantifying educational resources (e.g., course counts in GitHub Skills)",
        "Tasks in B specify searching repositories using exact tags/topics (e.g., 'web scraping', 'climate change')"
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Dataset A tasks emphasize real-time game updates and live score tracking, while B focuses on post-game summaries and final results.",
        "Dataset A includes tasks requiring navigation through multiple concurrent leagues (e.g., NBA/NHL/MLB), while B emphasizes single-league deep dives.",
        "Dataset B tasks frequently involve accessing historical/season-long standings (e.g., conference rankings), while A focuses on immediate standings.",
        "Dataset A requires checking injury reports across multiple teams simultaneously, while B targets single-team injury status lookups.",
        "Dataset B tasks require identification of conference/division structures (e.g., NFC North composition), absent in A.",
        "Dataset A involves direct comparison of betting odds across games, while B omits explicit betting odds navigation.",
        "Dataset B includes tasks requiring navigation to collegiate recruiting information, not present in A.",
        "Dataset A tasks require differentiation between game states (live/final/upcoming), while B assumes completed events.",
        "Dataset B emphasizes accessing team salary cap/roster financial data, not required in A.",
        "Dataset A requires simultaneous tracking of player statistics across multiple sports, while B focuses on single-sport athlete analysis"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Dataset B tasks emphasize NCAA sports (basketball, football) more prominently than dataset A",
        "Dataset B requires accessing ESPN+ specific content and tools, which is absent in dataset A",
        "Dataset A includes tasks related to historical data analysis (e.g., March Madness history) not present in B",
        "Dataset B focuses more on current/live game results while A includes future event planning (bowl schedules)",
        "Dataset A contains tasks involving cross-sport comparisons (EPL to NBA trades) not found in B",
        "Dataset B requires identification of conference/division compositions (NFC North teams) unlike A",
        "Dataset A includes multimedia navigation (podcasts, radio) while B focuses on written content/articles",
        "Dataset B tasks specifically request salary information and roster details absent in A",
        "Dataset A contains more fantasy sports integration tasks compared to B",
        "Dataset B emphasizes player recruitment tracking (NCAAW recruiting) not present in A"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Dataset A tasks focus on historical/seasonal data retrieval (e.g., 'specific seasons'), while B emphasizes current/recent information (e.g., 'latest NBA game', 'past 2 days')",
        "Dataset B requires identification of teams by specific attributes (e.g., 'Golden' in name), unlike A which uses direct team names for navigation",
        "Dataset A contains more multi-step hierarchical navigation (league\u2192team\u2192game), while B has more direct single-page lookups for scores/results",
        "Dataset B tasks involve player contract/salary information retrieval (e.g., 'highest salary'), which aren't present in A's requirements",
        "Dataset A includes explicit cross-sport comparisons, while B tasks remain sport-specific in navigation requirements",
        "Dataset B requires identification of conference/division structures (e.g., NFC North teams), unlike A's standalone team navigation",
        "Dataset A tasks focus on game progression details (quarters/periods), while B emphasizes final outcomes and summaries",
        "Dataset B includes explicit requests for content highlights/game summaries, not present in A's score/stats-focused tasks",
        "Dataset A requires navigation through chronological filters (specific dates), while B uses relative timeframes (last 5 games)",
        "Dataset B tasks involve roster/recruitment analysis (college recruiting rankings), absent from A's player stat-focused requirements"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Dataset B tasks require accessing ESPN+ specific content and tools, while Dataset A does not mention ESPN+ features.",
        "Dataset B includes navigation to detailed team standings with win-loss records in specific conferences (e.g., NFC North), absent in Dataset A.",
        "Dataset B tasks involve retrieving historical game highlights (e.g., \"95-yard TD\"), while Dataset A focuses on live/final scores without highlight summaries.",
        "Dataset B requires identifying NFL division compositions (e.g., NFC North teams), not seen in Dataset A tasks.",
        "Dataset B tasks demand player-specific game logs (e.g., \"Lionel Messi's last 5 games\"), whereas Dataset A focuses on aggregate team/player stats.",
        "Dataset B includes queries for NBA Basketball Power Index rankings, absent in Dataset A.",
        "Dataset B tasks require locating injury reports (e.g., \"Philadelphia 76ers' latest injuries\"), not mentioned in Dataset A.",
        "Dataset B involves verifying college recruitment data (e.g., \"NCAAW recruiting top players\"), which Dataset A does not cover.",
        "Dataset B tasks ask for granular NBA salary information (e.g., \"highest salary in Boston Celtics\"), while Dataset A focuses on general player salary stats.",
        "Dataset B requires filtering results by team name keywords (e.g., \"Golden\" in NHL team names), a specificity not present in Dataset A."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Tasks in Dataset B require retrieving scores and statistics from specific dates (e.g., December 25, 2023), while Dataset A focuses on real-time or recent results without explicit date constraints.",
        "Dataset B includes tasks targeting conference-specific team composition (e.g., NFC North teams), while Dataset A emphasizes conference-specific data for standings/rankings.",
        "Dataset B tasks involve retrieving salary information for players (e.g., highest-paid Celtics player), which Dataset A does not include.",
        "Dataset B queries explicitly request comparisons of team names (e.g., teams with 'Golden' in their name), absent in Dataset A.",
        "Dataset B requires identifying positional rankings (e.g., top scorer in a game), whereas Dataset A focuses on general player performance metrics.",
        "Dataset B tasks reference league-specific analytical tools (e.g., NBA Basketball Power Index), not mentioned in Dataset A.",
        "Dataset B includes NCAAW recruiting inquiries (e.g., top recruits\u2019 colleges), while Dataset A lacks recruiting-related tasks.",
        "Dataset B tasks demand historical player/team performance summaries (e.g., Messi\u2019s last 5 games), whereas Dataset A emphasizes current/recent data.",
        "Dataset B requires filtering results by conference parity (e.g., teams with equal wins/losses), absent in Dataset A\u2019s filtering criteria.",
        "Dataset B tasks explicitly ask for playoff/championship bracket details (e.g., CFP Bracket), while Dataset A focuses on tournament challenge interactions."
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Dataset B tasks require real-time or current data analysis (e.g., 'latest', 'as of today's date') while Dataset A focuses on static metadata extraction",
        "Dataset B tasks involve direct API interaction for content generation (e.g., story generation) whereas Dataset A focuses on API documentation lookup",
        "Dataset B requires identification of 'most downloaded' rankings across resource types, while Dataset A focuses on basic popularity metrics without ranking comparisons",
        "Dataset B tasks include financial considerations (e.g., Pro account pricing) not present in Dataset A",
        "Dataset B emphasizes temporal recency constraints (e.g., 'released in past month') more strictly than Dataset A's general recency filters",
        "Dataset B requires synthesis of multiple performance metrics (e.g., 'latest performance metrics and usage guidelines') while Dataset A focuses on singular metric extraction",
        "Dataset B tasks demand application-oriented model evaluation (e.g., 'detecting fake news') compared to Dataset A's technical specification comparisons",
        "Dataset B includes explicit versioning requirements (e.g., 'released in the past month') absent in Dataset A tasks",
        "Dataset B tasks require cross-modal analysis (e.g., connecting Text-to-3D models with Spaces implementations) beyond Dataset A's single-resource focus",
        "Dataset B emphasizes documentation exploration for implementation details (e.g., Trainer API parameters) while Dataset A focuses on basic documentation location"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Dataset B tasks emphasize real-time/current data retrieval (e.g., 'most downloaded currently') while Dataset A focuses on static metadata retrieval",
        "Dataset B requires direct API interaction for content generation (e.g., 'generate a short story') unlike Dataset A's focus on API documentation lookup",
        "Dataset B tasks require summarizing technical documentation sections while Dataset A focuses on locating specific documentation entries",
        "Dataset B emphasizes identification of newest resources (e.g., 'released in past month') compared to Dataset A's focus on general trending/recent resources",
        "Dataset B includes verification of model implementations in Spaces applications, unlike Dataset A",
        "Dataset B tasks specifically request numerical popularity metrics (e.g., 'highest number of downloads') while Dataset A focuses on general popularity indicators",
        "Dataset B requires exploration of blog content analysis while Dataset A focuses on research paper cross-referencing",
        "Dataset B tasks involve parameter configuration analysis in Trainer API while Dataset A focuses on basic API integration examples",
        "Dataset B includes specific modality combinations (e.g., text-to-3D) not present in Dataset A's task requirements",
        "Dataset B emphasizes temporal recency constraints (e.g., 'within March 2023') more strictly than Dataset A's relative time references"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Dataset A tasks predominantly involve searching for specific named models/datasets (e.g., 'Meta-Llama-3-8B') while Dataset B focuses on abstract category searches (e.g., 'audio-related datasets')",
        "Dataset B tasks emphasize real-time popularity metrics (e.g., 'current most downloaded', 'as of today's date') whereas Dataset A uses historical/static popularity indicators",
        "Dataset A contains tasks requiring multilingual documentation access (e.g., 'German installation instructions') not present in Dataset B",
        "Dataset B tasks require direct API interaction through web interfaces (e.g., 'use Inference API on webpage') unlike Dataset A's API endpoint identification tasks",
        "Dataset A includes troubleshooting scenarios (e.g., 'resolve task not found error') absent in Dataset B's tasks",
        "Dataset B emphasizes newer/open-source project exploration (e.g., 'latest open-source NLP models') more than Dataset A",
        "Dataset A tasks require academic paper retrieval while Dataset B focuses on toolkit capability summaries (e.g., 'Text Embeddings Inference strengths')",
        "Dataset B contains explicit pricing/account tier inquiries (e.g., 'Pro account cost') not found in Dataset A",
        "Dataset A tasks involve cross-referencing implementation details (e.g., 'Trainer API parameters') while Dataset B focuses on end-user functionality",
        "Dataset B requires temporal recency validation (e.g., 'last update within March 2023') as core task component unlike Dataset A's general recency mentions"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Dataset B tasks emphasize real-time/current metrics (e.g., 'most downloaded currently', 'latest as of today') while A focuses on historical/versioned metadata",
        "Dataset B requires direct API interaction demonstrations (e.g., generating stories, computing sentence similarities) where A focuses on API exploration/configuration",
        "Dataset B contains tasks requiring content summarization (e.g., blog overviews, model functionality descriptions) absent in A's documentation-focused patterns",
        "Dataset B emphasizes popularity metrics (download counts, trending status) as primary filters where A prioritizes technical attributes (modalities, library dependencies)",
        "Dataset B includes explicit price/feature comparisons for account tiers (Pro account details) while A focuses on enterprise deployment costs",
        "Dataset B tasks require identifying models by framework specialization (PaddlePaddle library) where A specifies framework compatibility without library filtering",
        "Dataset B contains educational resource exploration (classroom benefits) not present in A's task patterns",
        "Dataset B requires temporal recency validation ('within March 2023') as hard constraints where A uses relative time filters ('new/recent models')",
        "Dataset B tasks involve multimodal output analysis (text-to-3D models with Spaces integration) where A focuses on input modality specifications",
        "Dataset B requires cross-modal performance comparisons (model downloads vs Spaces usage) where A compares benchmark metrics"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Dataset B tasks emphasize real-time popularity metrics (e.g. 'most downloaded') more prominently than Dataset A",
        "Dataset B contains more explicit requirements to interact with live API endpoints (e.g. generate stories, calculate similarity)",
        "Dataset B tasks focus more heavily on temporal recency constraints (e.g. 'latest', 'released in past month')",
        "Dataset B includes more comparative ranking tasks (e.g. 'highest number of downloads', 'top 3') than Dataset A",
        "Dataset B requires more interpretation of model application scenarios (e.g. 'detecting fake news', 'recipe generation')",
        "Dataset B tasks more frequently require summarizing technical documentation content",
        "Dataset B contains explicit pricing/account feature investigation tasks absent in Dataset A",
        "Dataset B emphasizes cross-modal combinations (e.g. text-to-3D, audio datasets) more than Dataset A",
        "Dataset B tasks require more precise versioning awareness (e.g. 'March 2023', 'as of today's date')",
        "Dataset B includes more language pair-specific requirements (e.g. English-Chinese translation) than Dataset A"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Dataset B tasks require identifying specific rating thresholds (e.g., 'at least 4.5 stars') and review distributions (e.g., 5-star percentages), while A focuses on general course quality assessment.",
        "Dataset B includes explicit filtering by credit eligibility status (e.g., 'Credit Eligible'), which isn't present in Dataset A tasks.",
        "Dataset B tasks demand analysis of instructor biographies and cross-referencing their other offerings, while A focuses on institutional affiliations.",
        "Dataset B requires identification of regional partner institutions (e.g., Australian partners), whereas A focuses on global partner listings.",
        "Dataset B tasks specify duration ranges (e.g., '1-4 Years') as filter criteria, while A uses more general duration parameters.",
        "Dataset B includes queries about free course language of instruction verification, which isn't explicitly required in A's tasks.",
        "Dataset B tasks require comparison of rating distributions across difficulty levels (e.g., 'which level has least percentage'), while A focuses on single-level course identification.",
        "Dataset B emphasizes identification of specific assessment components (e.g., 'number of quizzes'), whereas A focuses on general course content.",
        "Dataset B tasks require enumeration of bachelor's degree programs, while A focuses on master's and certificate programs.",
        "Dataset B includes analysis of business/team plan features (e.g., 'summarize advantages of Coursera for Teams'), which A doesn't explicitly require."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Tasks in B require numerical analysis of course ratings (e.g., percentage calculations from star ratings)",
        "B includes multi-step filtering requirements combining credit eligibility with duration parameters",
        "B tasks demand summarization of business service advantages beyond course metadata extraction",
        "B requires identification of specific assessment quantities (e.g., number of quizzes in course structure)",
        "B tasks explicitly request learning outcome descriptions from specialization programs",
        "B contains geographical filtering requirements for institutional partners (e.g., Australian universities)",
        "B tasks involve comparative analysis of review distributions across rating tiers",
        "B requires identification of free course tags through interface interaction patterns",
        "B tasks mandate verification of project components within specialization structures",
        "B includes multi-attribute extraction requirements per query (name+institution+skills in single response)"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Dataset B tasks require quantitative analysis of course ratings/reviews (e.g., 4.5+ ratings, star distributions) while A focuses on qualitative metadata extraction",
        "Dataset B includes tasks requiring identification of business/enterprise solutions (Coursera for Teams/Business) not present in A",
        "Dataset B demands analysis of review distributions (percentage breakdown of star ratings) while A only requires basic review awareness",
        "Dataset B tasks require filtering by credit eligibility criteria absent in A's requirements",
        "Dataset B involves partner institution geography identification (e.g., Australian partners) not required in A",
        "Dataset B requires identification of course language offerings while A assumes English by default",
        "Dataset B tasks demand project component verification within Specializations unlike A's general project mentions",
        "Dataset B requires duration filtering with specific time ranges (1-4 years) while A uses broader duration categories",
        "Dataset B includes advanced assessment analysis (number of quizzes) where A only identifies assessment types generally",
        "Dataset B tasks require cross-referencing instructor bios and alternative course offerings beyond A's basic instructor info needs"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Dataset B tasks require explicit retrieval of quantitative metrics (e.g., ratings percentages, review counts) while Dataset A focuses on qualitative course information",
        "Dataset B includes tasks requiring analysis of review distribution patterns (e.g., star rating percentages per level) not present in Dataset A",
        "Dataset B tasks demand identification of geographical/institutional partnerships (e.g., Australian universities) not emphasized in Dataset A",
        "Dataset B requires cross-referencing multiple filter parameters simultaneously (e.g., credit eligibility + duration) more frequently than Dataset A",
        "Dataset B contains tasks specifically requesting assessment structure details (e.g., number of quizzes) not commonly required in Dataset A",
        "Dataset B emphasizes identification of instructor bios and alternative course offerings more than Dataset A",
        "Dataset B includes explicit requirements to verify hands-on project components in Specializations unlike Dataset A",
        "Dataset B tasks require comparison of statistical distributions within reviews (e.g., least common rating level) absent in Dataset A",
        "Dataset B contains specific requests for program structure details (e.g., full course lists in Specializations) beyond Dataset A's module-focused tasks",
        "Dataset B tasks demand verification of language options and regional availability more explicitly than Dataset A"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Dataset B tasks require quantitative analysis of review distributions (e.g., calculating percentage of specific star ratings)",
        "Dataset B queries demand identification of language of instruction for free courses",
        "Dataset B tasks involve filtering by credit eligibility status in addition to standard filters",
        "Dataset B requires explicit identification of learning outcomes/skills developed in specializations",
        "Dataset B tasks necessitate multi-criteria duration filtering (e.g., 1-4 years range specification)",
        "Dataset B queries require aggregation of partner institution lists by geographic location",
        "Dataset B tasks involve identification of assessment components (e.g., number of quizzes)",
        "Dataset B requires comparison of rating percentages across different star levels",
        "Dataset B tasks demand identification of project components within specializations",
        "Dataset B queries involve real-time result counts after applying multiple simultaneous filters"
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Dataset B requires quantitative analysis of search results (counting papers, comparing numbers across categories) while Dataset A focuses on information retrieval",
        "Tasks in Dataset B frequently involve date range filtering with explicit start/end dates rather than general recency criteria used in Dataset A",
        "Dataset B contains tasks requiring cross-category comparison searches (e.g. category-specific vs all-archive results) not present in Dataset A",
        "Dataset B includes tasks involving form-based interactions (e.g. survey participation, merchandise store) beyond core search functionality found in Dataset A",
        "Dataset B requires analysis of version history metadata (e.g. identifying v3 submission dates) while Dataset A focuses on basic version awareness",
        "Dataset B contains tasks requiring interpretation of submission guidelines/policies whereas Dataset A focuses on content extraction",
        "Dataset B includes tasks requiring random selection/processing from result sets, unlike the deterministic searches in Dataset A",
        "Dataset B requires navigation to/interpretation of institutional websites (e.g. university statistics) beyond arXiv's academic content in Dataset A",
        "Dataset B contains queries about arXiv's operational status and infrastructure that are absent from Dataset A's content-focused tasks",
        "Dataset B requires analysis of document structure elements (figure/table counts) while Dataset A focuses on textual content extraction"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Dataset B tasks require aggregating quantitative data (e.g., article counts, date comparisons) while Dataset A focuses on singular information retrieval",
        "Dataset B contains time-sensitive constraints (e.g., 'within last two days') not emphasized in Dataset A tasks",
        "Dataset B includes comparative analysis requirements (e.g., category vs. all-archive results) absent in Dataset A",
        "Dataset B tasks involve multi-criteria filtering (date ranges + author counts + content matching) beyond Dataset A's simpler filters",
        "Dataset B requires random selection from result sets, a feature not present in Dataset A tasks",
        "Dataset B contains meta-analytical queries about arXiv itself (categories/abbreviations, submission guidelines) unlike Dataset A's content-focused tasks",
        "Dataset B includes e-commerce interactions (product selection/cart management) not present in Dataset A",
        "Dataset B tasks demand version history tracking (specific submission versions) more explicitly than Dataset A",
        "Dataset B requires cross-referencing paper metadata with external events (conference acceptances) absent in Dataset A",
        "Dataset B contains format-specific content analysis (HTML parsing for goals/motivations) beyond Dataset A's basic format access"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Dataset B tasks require quantitative analysis (e.g., counting articles, figures, tables) while Dataset A focuses on direct retrieval.",
        "Dataset B includes time-bound comparative queries (e.g., 'three most recent announce days') whereas Dataset A uses simple date ranges.",
        "Dataset B tasks involve probabilistic actions (e.g., 'choose one at random') absent in Dataset A.",
        "Dataset B requires cross-referencing submission dates with external events (e.g., 'accepted for AAAI 2024') unlike Dataset A's standalone date queries.",
        "Dataset B contains meta-analytical tasks about arXiv itself (e.g., category abbreviations, submission guidelines) not seen in Dataset A.",
        "Dataset B tasks demand structural understanding of papers (e.g., counting figures/tables) while Dataset A focuses on content sections.",
        "Dataset B includes e-commerce interactions (e.g., 'add to cart') unrelated to core research functions in Dataset A.",
        "Dataset B requires multi-criteria synthesis (e.g., 'submitted between dates AND >5 authors') versus Dataset A's single-filter tasks.",
        "Dataset B contains external platform navigation (e.g., university websites) beyond arXiv's scope in Dataset A.",
        "Dataset B tasks involve citation/version analysis (e.g., 'when was v3 submitted?') while Dataset A focuses on current metadata."
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Dataset B tasks require counting search results or metadata quantities (e.g. 'how many articles/figures') while A focuses on locating specific content",
        "Dataset B contains time-sensitive queries requiring identification of 'most recent' submissions within narrow date ranges (e.g. last 2 days) rather than general date filtering",
        "Dataset B tasks involve verifying conference acceptance status (e.g. AAAI 2024) as part of paper metadata analysis",
        "Dataset B requires interaction with non-research content elements (e.g. shopping cart, university websites) beyond paper repositories",
        "Dataset B tasks demand comparative analysis across search parameters (e.g. category-specific vs. cross-category results)",
        "Dataset B contains explicit version history tracking requirements (e.g. 'when was v3 submitted') beyond general version awareness",
        "Dataset B tasks involve external website navigation from arXiv links (e.g. institutional sites) for supplemental information",
        "Dataset B requires understanding of submission guidelines and technical requirements (e.g. figure formats) rather than just consumption",
        "Dataset B tasks focus on real-time data freshness (e.g. 'latest news', current student numbers) rather than static content retrieval",
        "Dataset B contains interactive elements with dynamic content generation (e.g. random selection from results, QR code sharing)"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Dataset B tasks require quantitative analysis (e.g., counting articles, figures, tables) while Dataset A focuses on retrieval without numerical processing",
        "Dataset B tasks involve comparative queries between category-specific vs. all-archive searches, whereas Dataset A focuses on single-category exploration",
        "Dataset B requires temporal analysis of submission patterns (e.g., most recent announce days) while Dataset A focuses on absolute temporal information retrieval",
        "Dataset B tasks demand interaction with e-commerce elements (e.g., non-profit store, shopping cart) absent in Dataset A requirements",
        "Dataset B includes meta-analytical tasks about arXiv's own structure (e.g., category abbreviations) unlike Dataset A's content-focused tasks",
        "Dataset B requires version history tracking across multiple submissions (e.g., v3 submission dates) while Dataset A focuses on current version retrieval",
        "Dataset B tasks involve multi-step analytical workflows (count->select->summarize) compared to Dataset A's single-objective retrievals",
        "Dataset B contains tasks requiring random sampling from result sets, unlike Dataset A's deterministic retrieval requirements",
        "Dataset B demands content summarization (e.g., main findings) while Dataset A focuses on verbatim information extraction",
        "Dataset B includes cross-platform navigation tasks (e.g., university websites) beyond arXiv's core functionality required in Dataset A"
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Dataset B tasks require explicit content summarization and key point extraction from articles, while Dataset A focuses on information retrieval without synthesis",
        "Dataset B includes tasks demanding identification of authorship and publication dates, indicating structured metadata presentation not emphasized in Dataset A",
        "Dataset B contains queries about quantitative analysis of content (e.g., counting sections/players), whereas Dataset A focuses on qualitative information retrieval",
        "Dataset B tasks require cross-referencing specific visual elements with textual content for comprehensive understanding, beyond simple image-text association in Dataset A",
        "Dataset B emphasizes verification of current status in dynamic sections (Market Data, Leaderboards), while Dataset A focuses on static hierarchical navigation",
        "Dataset B tasks involve temporal filtering constraints (e.g., 'within last two days'), requiring more precise timestamp utilization than Dataset A's general recency needs",
        "Dataset B includes analytical tasks about economic/systemic impacts, whereas Dataset A focuses on factual event reporting",
        "Dataset B requires navigation to specialized content guides (e.g., climate change explainers), indicating curated educational content structures not prominent in Dataset A",
        "Dataset B tasks demand geographic specificity in weather/event reporting beyond Dataset A's general regional context extraction",
        "Dataset B contains explicit requests for source attribution verification (e.g., data provenance in Market Data), while Dataset A focuses on content consumption without source validation"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Tasks in B require summarizing article significance/context (e.g. archaeological discoveries' importance) rather than just locating information",
        "B contains tasks requiring identification of human causes behind phenomena (e.g. climate change drivers) through explanatory guides",
        "B includes analytical tasks comparing country representation in ranked lists (e.g. golf leaderboard nationalities)",
        "B requires explicit tracking of author names and institutional affiliations in business/tech reporting",
        "B features tasks involving calendar-based event tracking (e.g. athletics competitions scheduling)",
        "B contains specialized queries about section taxonomy (e.g. counting war-related categories)",
        "B includes meta-analysis of content organization (e.g. The SpeciaList city features in Travel)",
        "B requires interpretation of data visualizations/market data sources in financial reporting",
        "B tasks demand precise matching of article titles (e.g. \"What is climate change? A really simple guide\")",
        "B contains comparative analysis of economic impacts across regions (e.g. Brexit effects on EU economies)"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Dataset B tasks require explicit extraction of article metadata (author names, publication dates) while Dataset A focuses on general content location",
        "Dataset B emphasizes structured data retrieval from specific content types (Leaderboards, Market Data) absent in Dataset A tasks",
        "Dataset B tasks frequently demand quantitative analysis (counting sections, ranking players) not present in Dataset A requirements",
        "Dataset A tasks involve broader content exploration across multiple sections while Dataset B focuses on targeted single-topic extraction",
        "Dataset B requires identification and analysis of visual elements (first picture in stories) not mentioned in Dataset A objectives",
        "Dataset B tasks specifically reference guide-style content (\"really simple guide\" format) not present in Dataset A samples",
        "Dataset A includes entertainment/culture-focused tasks (movie reviews, podcasts) absent from Dataset B's current events focus",
        "Dataset B tasks require temporal precision (\"within last two days\") while Dataset A uses relative timeframes (\"recent\")",
        "Dataset B emphasizes economic/policy analysis (trade deal implications) whereas Dataset A focuses on factual reporting",
        "Dataset A tasks involve multi-platform content (video pages, podcasts) while Dataset B focuses strictly on article-based information"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Tasks in B require summarization of specific article content more frequently than A",
        "B includes explicit instructions to identify and report publication dates/authors not emphasized in A",
        "B tasks demand quantitative answers (counts, rankings) absent in A's requirements",
        "B contains tasks targeting specialized content modules like Market Data not present in A samples",
        "B requires navigation to verify calendar-based information (event dates) unlike A",
        "Tasks in B specify exact article titles to locate rather than general topic searches in A",
        "B emphasizes time-bound constraints (e.g. 'within last two days') more rigorously than A",
        "B includes instructions to analyze human causes in explanatory guides absent in A's tasks",
        "Tasks in B frequently require cross-referencing specific team/player data in sports sections",
        "B contains explicit requirements to identify review content in Culture section not seen in A"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Tasks in B require summarizing key points from articles (e.g., 'summarize the key points discussed') while A focuses on locating/extracting information without synthesis",
        "B includes explicit requirements to identify article authors and publication dates (e.g., 'provide the date of publication') not present in A's tasks",
        "B contains tasks requiring quantitative analysis of content structure (e.g., 'How many War related sections') while A focuses on qualitative distinctions",
        "B requires navigation to specialized content guides (e.g., 'What is climate change? A really simple guide') with explicit named resource targets",
        "B tasks involve interacting with structured data formats (e.g., 'Market Data section', 'Leaderboard') absent from A's requirements",
        "B includes temporal constraints in queries (e.g., 'published within the last two days') while A uses relative timestamps without date ranges",
        "B tasks require cross-sectional comparisons (e.g., 'which country has the most players') involving data aggregation across content",
        "B contains explicit geographic mapping requirements (e.g., 'where and when the severe weather occurred') with spatiotemporal precision",
        "B tasks demand identification of section-specific editorial features (e.g., 'The SpeciaList section in Travel') as distinct content types",
        "B includes meta-analytical requirements about content sourcing (e.g., 'which company the data comes from') beyond content extraction"
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Dataset B tasks require multi-step filtering (e.g., price + ratings + size + specific features) in a single query, while Dataset A tasks filter by individual attributes sequentially.",
        "Dataset B tasks explicitly demand validation of return policies or delivery eligibility for specific items, whereas Dataset A focuses on general logistical verification.",
        "Dataset B includes tasks requiring comparison of products based on granular metrics (e.g., review counts >20,000), while Dataset A comparisons focus on basic price/feature contrasts.",
        "Dataset B tasks specify exact numerical thresholds (e.g., '10x zoom', '300 sq ft room size'), whereas Dataset A uses broader qualitative descriptors (e.g., 'eco-friendly').",
        "Dataset B requires sorting results by non-default criteria (e.g., 'Newest Arrivals', 'Best Sellers'), while Dataset A tasks do not involve explicit sorting instructions.",
        "Dataset B tasks involve time-bound product searches (e.g., 'released within a month'), which are absent in Dataset A.",
        "Dataset B includes explicit instructions to save/select items based on derived metrics (e.g., 'lowest priced among results'), while Dataset A tasks involve direct purchases without intermediate curation.",
        "Dataset B tasks mandate validation of compatibility with specific devices (e.g., 'MacBook Pro'), whereas Dataset A compatibility checks are generic (e.g., 'devices').",
        "Dataset B requires confirming availability of niche features (e.g., 'anti-squirrel mechanism'), while Dataset A focuses on core product attributes.",
        "Dataset B tasks specify exact pack quantities (e.g., '10 lights'), whereas Dataset A uses open-ended terms (e.g., 'multiple books')."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Dataset B tasks require handling multi-attribute filtering combinations (e.g. price + size + color + rating thresholds) while Dataset A focuses on single-attribute filters",
        "Dataset B contains explicit requirements for post-purchase information checks (return policies, shipping details) not present in Dataset A",
        "Dataset B tasks involve saving/search persistence actions (\"save the lowest priced\") absent from Dataset A's basic cart operations",
        "Dataset B requires comparison of specific quantitative metrics between results (e.g. room size capacity, zoom ratios) unlike Dataset A's general price comparisons",
        "Dataset B includes time-sensitive product attributes (release dates within 1 month) while Dataset A focuses on existing inventory",
        "Dataset B tasks demand technical compatibility verification (MacBook compatibility) not required in Dataset A's generic product searches",
        "Dataset B contains explicit review volume requirements (\"over 20,000 reviews\") whereas Dataset A only uses star ratings",
        "Dataset B requires identification of specific mechanical features (anti-squirrel mechanisms) vs Dataset A's basic category navigation",
        "Dataset B tasks involve precise measurement ranges (300 sq ft, 10x zoom) compared to Dataset A's qualitative descriptors",
        "Dataset B includes digital content navigation (Kindle Store releases) while Dataset A focuses on physical goods"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Tasks in dataset B require applying multiple simultaneous filters (e.g. price+rating+size+color) with exact numerical thresholds",
        "Dataset B tasks demand validation of specific availability details like color variants and shipping policies post-search",
        "Dataset B requires cross-referencing compatibility specifications (e.g. MacBook Pro compatibility) during product searches",
        "Tasks in B explicitly require sorting mechanisms (e.g. 'Best Sellers') as part of the selection criteria",
        "Dataset B contains tasks requiring price comparison within defined ranges (e.g. $100-300) rather than general affordability checks",
        "B's tasks frequently specify exact measurement requirements (e.g. 5mm thickness, 300 sq ft capacity)",
        "Dataset B requires validation of technical specifications (e.g. HDMI/SD card ports count) beyond basic product categories",
        "Tasks in B demand explicit return policy verification as part of purchase validation",
        "Dataset B includes tasks requiring temporal filtering (e.g. upcoming releases within 1 month)",
        "B's tasks require inventory checks for specific variant combinations (e.g. color+size+pattern availability)"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Tasks in B require specifying multiple attribute combinations (e.g., color + size + rating + price thresholds) simultaneously",
        "B includes explicit instructions to verify exact color count availability for products",
        "Tasks in B demand cross-referencing review counts (e.g., 'over 20,000 reviews') with ratings",
        "B requires identifying products with specific technical compatibility requirements (e.g., MacBook Pro compatibility)",
        "Tasks in B specify exact release date windows (e.g., 'released within a month') for time-sensitive products",
        "B contains instructions to compare prices across precisely defined competitor sets (e.g., 'top three search results')",
        "Tasks in B require verifying energy efficiency certifications/ratings for appliances",
        "B includes explicit instructions to combine sorting (e.g., 'Best Sellers') with filtering constraints",
        "Tasks in B demand checking shipping cost specifics within price constraints (e.g., 'free shipping' verification)",
        "B contains multi-phase tasks requiring sequential filtering then selection from narrowed results"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Dataset B tasks require multi-criteria filtering combinations (e.g., color + size + rating + price) while Dataset A uses singular primary filters",
        "Dataset B emphasizes precise numerical constraints (e.g., 'at least 5mm thickness', '10x zoom') where Dataset A uses relative terms ('eco-friendly', 'luxury')",
        "Dataset B includes explicit comparison requirements between multiple products/results while Dataset A focuses on single product actions",
        "Dataset B tasks demand verification of multi-dimensional availability (color variants, size options, pack quantities) beyond basic stock status checks in Dataset A",
        "Dataset B requires conditional policy analysis (e.g., 'if free return available, explain process') whereas Dataset A only verifies policy existence",
        "Dataset B contains explicit sorting method specifications (e.g., 'sort by Best Sellers') while Dataset A uses generic sorting requirements",
        "Dataset B includes temporal constraints for upcoming releases/future dates not present in Dataset A tasks",
        "Dataset B requires quantitative measurement validation (e.g., '300 sq ft coverage', '20,000 reviews') absent in Dataset A's qualitative thresholds",
        "Dataset B tasks specify exact compatibility requirements (e.g., 'MacBook Pro compatible') versus Dataset A's general compatibility mentions",
        "Dataset B contains post-selection actions (e.g., 'save lowest priced result') while Dataset A focuses on immediate cart additions"
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Dataset B tasks require integration of personalized health metrics (e.g., age, weight, height) in calculations",
        "Dataset B includes multi-step unit conversions involving compound substances (e.g., mass-to-molar conversions with elemental composition analysis)",
        "Dataset B tasks demand polynomial simplification with explicit formatting constraints (e.g., reducing terms while maintaining equivalence)",
        "Dataset B contains queries about hypothetical scenario modeling with fixed assumptions (e.g., standardized food serving weights)",
        "Dataset B requires biological relationship quantification through fractional genetic inheritance calculations",
        "Dataset B tasks involve convergence/divergence analysis of infinite series with specific mathematical tests",
        "Dataset B includes differential equation solutions requiring constant determination through boundary conditions",
        "Dataset B combines astronomical property comparisons with planetary day length calculations in single queries",
        "Dataset B tasks incorporate multi-decade currency valuation adjustments with inflation considerations",
        "Dataset B requires environmental impact calculations based on geographic location and biological factors (e.g., UV exposure timelines)"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Tasks in B require personalized health or fitness calculations (e.g., age, weight, height).",
        "B includes dynamic physical system simulations (e.g., spring pendulums, ODEs with trigonometric terms).",
        "B emphasizes combinatorial or optimization problems (e.g., polyomino combinations, polynomial simplification).",
        "Tasks in B demand precise numerical formatting (e.g., scientific notation with specific significant figures).",
        "B involves multi-part unit conversions with compound outputs (e.g., mass to molar + elemental composition).",
        "B includes real-world financial comparisons across time (e.g., historical currency value adjustments).",
        "B requires solving non-linear differential equations with boundary conditions (e.g., g'(0)=1).",
        "Tasks in B focus on material property comparisons (e.g., thermal conductivity of metals at specific temperatures).",
        "B contains applied geometry constraints (e.g., inner regions of geometric shapes like pentagrams).",
        "B incorporates temporal-spatial calculations (e.g., sunburn duration based on location and skin type)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Dataset B tasks require specific numerical constraints or parameters (e.g., age, weight, time duration) for personalized calculations.",
        "Dataset B tasks involve combinatorial or geometric optimization (e.g., polyomino combinations, shape constraints in pentagram inequalities).",
        "Dataset B tasks focus on multi-variable comparisons (e.g., calorie comparisons between food items, material conductivity comparisons).",
        "Dataset B tasks explicitly demand result simplification (e.g., polynomial simplification, reducing item count in expressions).",
        "Dataset B tasks include multi-part questions requiring combined outputs (e.g., mass of Jupiter *and* day length).",
        "Dataset B tasks involve family relationship quantification (e.g., blood relationship fractions).",
        "Dataset B tasks require direct conversion between physical quantities and molar/chemical composition.",
        "Dataset B tasks emphasize precision specifications (e.g., 'retain 5 significant figures in scientific notation').",
        "Dataset B tasks focus on time-sensitive real-world data (e.g., 2023 prices, modern inflation-adjusted currency values).",
        "Dataset B tasks apply mathematical concepts to biomechanical/physical systems (e.g., spring pendulum dynamics, sunburn time calculation)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Dataset B tasks require multi-variable parametric inputs with real-time physical constraints (e.g., mass, spring constants, time-dependent variables).",
        "Dataset B queries involve dynamic parameter adjustments for iterative problem-solving (e.g., ODE initial conditions, convergence testing).",
        "Dataset B tasks combine physical system modeling with mathematical computation (e.g., pendulum dynamics, thermodynamic calculations).",
        "Dataset B requires symbolic equation manipulation alongside numerical computation (e.g., polynomial simplification, series convergence analysis).",
        "Dataset B tasks focus on multi-output unit conversions with contextual interpretation (e.g., mass-to-molar conversions with elemental composition).",
        "Dataset B emphasizes combinatorial mathematics applications (e.g., polyomino permutations, row-based shape constraints).",
        "Dataset B queries integrate temporal-spatial relationships (e.g., sunburn time calculations with geographic/time parameters).",
        "Dataset B tasks demand precision control in results formatting (e.g., scientific notation with specific significant figures).",
        "Dataset B requires cross-domain data synthesis (e.g., combining nutritional data with biomechanical energy expenditure).",
        "Dataset B tasks involve conditional constant determination in differential equations through boundary value constraints."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Dataset B tasks require handling precise numerical parameters (e.g., age, weight, velocity) for personalized calculations.",
        "Dataset B tasks demand multi-step computations combining unit conversions with domain-specific analyses (e.g., chemistry to molar conversions).",
        "Dataset B tasks involve dynamic physical system modeling (e.g., spring pendulums, differential equations with initial conditions).",
        "Dataset B tasks specify exact formatting requirements for outputs (e.g., scientific notation with significant figures).",
        "Dataset B tasks focus on real-world optimization (e.g., simplifying polynomials, constraint-based inequalities).",
        "Dataset B tasks require interdisciplinary integration (e.g., merging biology, physics, and nutrition for calorie burn estimates).",
        "Dataset B tasks include conditional scenarios (e.g., SPF-based sunburn time by skin type, temperature-dependent material properties).",
        "Dataset B tasks emphasize practical, time-sensitive applications (e.g., weight loss timelines, real-time pricing comparisons).",
        "Dataset B tasks target complex relationship mapping (e.g., familial blood fractions, combinatorial polyomino configurations).",
        "Dataset B tasks involve comparative aggregation across multiple entities (e.g., food calorie comparisons, multi-city price averages)."
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Dataset B tasks emphasize finding recipes with specific review count thresholds (e.g. >200 reviews), while Dataset A focuses on general popularity metrics",
        "Dataset B tasks require structured outputs like ingredient lists and step-by-step instructions, while Dataset A focuses on recipe discovery/saving actions",
        "Dataset B explicitly requests rating filters (e.g. 4+ stars) in 80% of tasks, compared to Dataset A's more general quality assessments",
        "Dataset A contains more time-sensitive queries (e.g. 'quick', 'weeknight'), while Dataset B emphasizes comprehensive recipe analysis",
        "Dataset B tasks frequently specify serving size requirements (e.g. 'suitable for 6 people'), absent in Dataset A queries",
        "Dataset A shows stronger focus on dietary substitutions (gluten-free, keto alternatives), while Dataset B emphasizes recipe authenticity",
        "Dataset B includes comparative analysis tasks (e.g. 'list 3 recommended'), whereas Dataset A focuses on single-recipe retrieval",
        "Dataset A tasks contain more ingredient-specific queries (e.g. 'leftover cranberries'), while Dataset B uses broader category searches",
        "Dataset B requires calorie counting/nutritional analysis in 30% of tasks, compared to Dataset A's general health focus",
        "Dataset A features more preservation actions (save/bookmark), while Dataset B emphasizes information synthesis/transformation"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Tasks in B require locating recipes with significantly higher minimum review counts (e.g., 200+ or 500+ reviews) compared to A's lower thresholds (e.g., 100+ reviews).",
        "B's tasks emphasize recipes with precise star ratings (e.g., 4.5 stars or higher) rather than A's general minimum thresholds (e.g., 4 stars).",
        "B includes explicit requirements for structured outputs like ingredient lists, preparation steps, or shopping lists, whereas A focuses on discovery/saving actions.",
        "Tasks in B frequently specify exact cuisine types (e.g., Italian, Greek) rather than A's broader meal categories (e.g., dinner, dessert).",
        "B requires identifying recipes with quantified nutritional constraints (e.g., under 600 calories/serving) while A only mentions general nutritional parsing.",
        "Tasks in B demand recipe summaries including prep/cook time breakdowns, which are not explicitly required in A's tasks.",
        "B includes requests for metadata analysis (e.g., latest review content, average rating calculations) not present in A's tasks.",
        "Tasks in B often require scaling recipes to specific serving sizes (e.g., 'suitable for 6 people'), unlike A's general serving needs.",
        "B emphasizes recipes with extreme popularity thresholds (e.g., 'over 1000 reviews') compared to A's moderate popularity requirements.",
        "Tasks in B explicitly demand identification of specific ingredients (e.g., 'primary cheese used') rather than A's general dietary filters."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Dataset A tasks involve saving/bookmarking specific recipes (e.g. 'Save three new recipes'), while Dataset B focuses on retrieving existing recipe metadata without saving actions",
        "Dataset B tasks require precise numerical thresholds (e.g. 'over 500 reviews', '4.5 stars'), while Dataset A uses relative qualifiers like 'highly-rated' without specific numbers",
        "Dataset A contains tasks requiring user-generated content interaction (e.g. 'Leave a review'), while Dataset B focuses solely on information retrieval",
        "Dataset B tasks demand detailed nutritional analysis (e.g. 'under 600 calories per serving'), while Dataset A mentions calorie counts without specific constraints",
        "Dataset A includes recipe modification requests (e.g. 'Get suggestions to modify'), while Dataset B focuses on finding existing recipes as-is",
        "Dataset B requires multi-criteria filtering (e.g. 'vegetarian + <30min prep + >4 stars'), while Dataset A tasks use single constraints",
        "Dataset A tasks involve personalization (e.g. 'kid-friendly', 'meal prep'), while Dataset B uses objective metrics",
        "Dataset B tasks require output formatting (e.g. 'shopping list creation'), while Dataset A focuses on information discovery",
        "Dataset A contains exploratory tasks (e.g. 'Explore different recipes'), while Dataset B uses targeted search with exact parameters",
        "Dataset B tasks specify audience size (e.g. 'suitable for 6 people'), while Dataset A focuses on individual use cases"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Dataset B tasks require recipes with higher review thresholds (e.g., 'over 500 reviews') compared to Dataset A.",
        "Dataset B emphasizes structured outputs (e.g., ingredient lists, step-by-step instructions) more explicitly than Dataset A.",
        "Dataset B tasks frequently specify exact rating thresholds (e.g., '4.5 stars or higher'), whereas Dataset A uses broader rating ranges (e.g., '4 stars or higher').",
        "Dataset B includes tasks requiring calorie restrictions per serving (e.g., 'under 600 calories'), which are absent in Dataset A.",
        "Dataset B tasks often demand quantifiable constraints (e.g., '10 ingredients or less') not commonly seen in Dataset A.",
        "Dataset B tasks explicitly require summaries of recipes (e.g., ingredients, prep time, cooking instructions), unlike Dataset A.",
        "Dataset B tasks prioritize recipes with exceptionally high popularity (e.g., 'over 1000 reviews'), while Dataset A focuses on general popularity.",
        "Dataset B tasks include creating shopping lists, a requirement not present in Dataset A.",
        "Dataset B tasks frequently involve multi-filter criteria (e.g., 'vegetarian + under 1 hour prep + 4.5 stars'), whereas Dataset A tasks use simpler filters.",
        "Dataset B tasks explicitly target niche dietary needs (e.g., vegan, Mediterranean diet) more often than Dataset A."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Dataset B tasks require filtering by exact numerical thresholds (e.g., >200 reviews), while Dataset A focuses on general popularity metrics without specific counts",
        "Dataset B emphasizes strict rating requirements (e.g., minimum 4.5 stars), whereas Dataset A uses qualitative terms like 'top-rated' without numeric thresholds",
        "Dataset B tasks demand structured output formats (ingredient lists, step-by-step instructions), while Dataset A focuses on discovery/saving without output specifications",
        "Dataset B explicitly requires dietary-specific recipes (vegan/vegetarian) with hard constraints, while Dataset A mentions dietary preferences with flexible implementation",
        "Dataset B tasks involve precise nutritional/time parameters (e.g., <600 calories, <1 hour prep), whereas Dataset A uses relative terms like 'quick' or 'easy'",
        "Dataset B requires verification of recipe popularity through review counts (>500 reviews), while Dataset A emphasizes recent/seasonal content over historical engagement",
        "Dataset B tasks prioritize existing high-performing recipes, while Dataset A includes creative adaptation tasks (e.g., leftover ingredient utilization)",
        "Dataset B specifies exact ingredient combinations (e.g., chicken+quinoa), while Dataset A uses broader ingredient filters without mandatory pairings",
        "Dataset B requires comparative analysis of multiple recipes, while Dataset A focuses on single-recipe interactions like reviews/bookmarking",
        "Dataset B tasks involve quantitative data extraction (cooking times, serving sizes), while Dataset A emphasizes qualitative evaluation through user reviews"
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Dataset B tasks require users to provide example sentences in their responses, while Dataset A does not explicitly mandate this.",
        "Dataset B includes tasks that ask for the specific translation service provider used (e.g., 'which company provided the translation'), whereas Dataset A focuses only on translation outputs.",
        "Dataset B requires identification of grammatical rule applications in multiple sentence formats (affirmative/negative/interrogative), while Dataset A focuses on general grammar exploration.",
        "Dataset B tasks demand quantitative analysis of word meanings (e.g., 'how many meanings'), which Dataset A does not require.",
        "Dataset B includes product-oriented tasks (e.g., 'listing 3 items' in the Shop section) not present in Dataset A.",
        "Dataset B specifies interactive challenges with time constraints in game tasks (e.g., 'beat the clock'), while Dataset A only references general gameplay.",
        "Dataset B emphasizes dual accent pronunciation validation (explicitly requiring both UK/US) for all pronunciation tasks, whereas Dataset A sometimes includes single-accent requests.",
        "Dataset B requires multi-language translation verification (e.g., Chinese and French translations simultaneously), while Dataset A focuses on single-language pair translations.",
        "Dataset B tasks involve identifying grammatical structures through pattern recognition (e.g., 'prepositions that consist of groups of words'), unlike Dataset A's general grammar queries.",
        "Dataset B mandates contextual usage analysis through multiple example sentences for single words, while Dataset A typically requires one usage example."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Dataset B requires users to explicitly count and report the number of distinct meanings/definitions for polysemous words (e.g. 'How many meanings of...'), while Dataset A only asks for identification without quantification.",
        "Tasks in Dataset B specifically demand translations into multiple target languages within a single query (e.g. Chinese AND French), whereas Dataset A requests single-language translations per task.",
        "Dataset B includes tasks that require interaction with commercial sections (Cambridge Dictionary Shop) and product listings, which are absent from Dataset A's requirements.",
        "Dataset B tasks explicitly reference and require use of the Word Scramble game feature in the Plus section, while Dataset A only mentions general vocabulary features.",
        "Dataset B contains tasks that ask users to identify the corporate provider/attribution for translation services, a requirement not present in Dataset A.",
        "Grammar-related tasks in Dataset B specifically target particular grammatical constructs (e.g. 'modal verbs for possibility') rather than general grammar exploration as in Dataset A.",
        "Dataset B requires users to demonstrate multi-step game interaction (e.g. 'try the first example' in Word Scramble), while Dataset A focuses on static information retrieval.",
        "Tasks in Dataset B frequently combine pronunciation retrieval with requirements for both UK/US variants and contextual example sentences in single queries, whereas Dataset A often separates these elements.",
        "Dataset B includes explicit instructions to validate translation directionality (e.g. 'change language direction'), while Dataset A assumes default bidirectional translation capabilities.",
        "Dataset B tasks require production of multiple contextual examples demonstrating different usage scenarios for the same term, whereas Dataset A typically requests single example retrievals."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Dataset B tasks require providing numerical answers (e.g., count of definitions)",
        "Dataset B includes tasks involving explicit exploration of the Shop section",
        "Dataset B tasks require identifying third-party service attribution for translations",
        "Dataset B tasks specify direct interaction with word games (e.g., 'try the first example')",
        "Dataset B requires comparing prepositions consisting of word groups in grammar section",
        "Dataset B tasks demand listing specific quantities of items (e.g., '3 items')",
        "Dataset B includes verification of translation service providers",
        "Dataset B requires identification of grammatical structure categories (e.g., 'groups of words')",
        "Dataset B tasks specify analyzing commercial/store content",
        "Dataset B includes explicit requirements for testing game mechanics"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Dataset A tasks require sharing definitions via social media features (e.g., Twitter) while Dataset B tasks do not include social media sharing actions",
        "Dataset B tasks explicitly require counting/numbering definitions (e.g., 'how many meanings') while Dataset A focuses on retrieval without quantification",
        "Dataset B tasks specify requirements for multiple contextual examples (e.g., 'use it in two example sentences') while Dataset A requests single examples",
        "Dataset B includes tasks requiring identification of translation service providers/attribution while Dataset A focuses purely on translation retrieval",
        "Dataset B tasks demand interaction with commercial sections (e.g., 'list 3 Shop items') while Dataset A only requires navigation to these sections",
        "Dataset B contains explicit grammar structure analysis (e.g., 'present perfect simple uses') while Dataset A addresses broader grammar categories",
        "Dataset B tasks require validation of game interactions (e.g., 'Can you beat the clock?') while Dataset A mentions games without performance verification",
        "Dataset B includes language direction specificity in translations (e.g., 'English-Chinese') while Dataset A uses general translation tasks",
        "Dataset B tasks focus on morphological components (e.g., 'prepositions that consist of groups of words') while Dataset A addresses broader grammatical concepts",
        "Dataset B requires comparison of regional pronunciation variants within single tasks while Dataset A treats them as separate lookup actions"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Dataset B includes tasks that require interaction with e-commerce features (e.g., browsing the Cambridge Dictionary Shop section).",
        "Dataset B tasks involve navigating to and interacting with gamified elements (e.g., Word Scramble games in the Plus section).",
        "Dataset B requires users to identify third-party services or advertisements integrated into the platform (e.g., recognizing translation service providers).",
        "Dataset B tasks include actions like creating/downloading user-generated word lists or quizzes via the +Plus feature.",
        "Dataset B tasks explicitly require users to identify external companies or tools linked to specific features (e.g., translation tool providers).",
        "Dataset B tasks involve navigating to product-specific pages (e.g., listing Shop items).",
        "Dataset B includes tasks that require counting or listing items from non-lexical sections (e.g., Shop products).",
        "Dataset B tasks involve timed challenges (e.g., beating the clock in word games).",
        "Dataset B tasks require users to interact with promotional or marketing content (e.g., Plus section promotions).",
        "Dataset B tasks focus on exploring non-core dictionary features (e.g., Shop, games, third-party tools) alongside traditional lookup tasks."
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Dataset B tasks focus on retrieving exact numerical values (weight, storage, quantity) while Dataset A emphasizes feature comparisons and compatibility checks",
        "Dataset B requires identifying specific component options during product customization processes whereas Dataset A focuses on accessory pairing recommendations",
        "Dataset B tasks frequently involve checking local availability/pickup options while Dataset A focuses on regional release dates at country level",
        "Dataset B contains tasks requiring mathematical calculations between upgrade options whereas Dataset A focuses on qualitative feature comparisons",
        "Dataset B emphasizes enumeration tasks (list types/counts of products) while Dataset A focuses on finding detailed technical specifications",
        "Dataset B tasks target standalone product specifications whereas Dataset A requires cross-referencing between multiple products/accessories",
        "Dataset B includes explicit requests for hardware component specifications while Dataset A focuses more on software/service integration details",
        "Dataset B tasks require identification of specific model variants/configuration combinations unavailable in Dataset A's broader comparisons",
        "Dataset B emphasizes physical product attributes (dimensions, colors) while Dataset A focuses on technical performance characteristics",
        "Dataset B contains tasks requiring verification of specific feature availability per model whereas Dataset A focuses on general compatibility checks"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Tasks require identifying exact product weight and dimensional measurements (e.g. Vision Pro weight)",
        "Navigation involves verifying specific regional release dates (e.g. Vision Pro availability locations)",
        "Requires counting distinct product variants/types (e.g. number of AirPods models)",
        "Tasks demand listing specific hardware-supported features (e.g. Siri Remote capabilities)",
        "Requires identification of exact base model configurations without optional accessories",
        "Navigation paths involve checking compatibility matrices (e.g. iOS version with specific devices)",
        "Tasks require extracting numerical specifications from technical documentation (storage/RAM details)",
        "Involves verifying in-store inventory availability by zip code",
        "Requires price calculation across multiple upgrade tiers (base vs max configuration)",
        "Tasks involve identifying product marketing slogans from category pages"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Tasks in B require enumeration of exact quantities (e.g., 'list 5 Built-in Apps') not present in A",
        "B includes explicit requests for device weight measurements while A focuses on general specifications",
        "B tasks demand identification of specific software/hardware compatibility details (e.g., iOS 17 with iPhone 12)",
        "B requires checking precise accessory configurations (e.g., 'no engraving, no apple pencil' specifications)",
        "B contains tasks requiring price difference calculations between base and maximum upgrade configurations",
        "B tasks involve direct comparisons of technical specifications between consecutive product generations (e.g., iPhone 14 Pro vs 15 Pro)",
        "B requires identification of specific component features (e.g., 'Wireless pairing and charging' capabilities)",
        "B includes tasks for verifying product slogan information not present in A's requirements",
        "B tasks demand checking availability of specific color variants for products",
        "B contains explicit requests for password recovery solutions while A focuses on general support resources"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Dataset B tasks focus on retrieving exact numerical specifications (weight, dimensions) rather than general product features",
        "Dataset B requires checking regional availability for unreleased products, while A focuses on existing regional availability information",
        "Dataset B tasks involve explicit compatibility verification between specific OS versions and hardware models",
        "Dataset B contains queries requiring inventory checks with geographic specificity (zip code-based availability)",
        "Dataset B tasks demand counting distinct product variants/options available in customization interfaces",
        "Dataset B requires price calculation through component upgrades rather than simple configuration comparisons",
        "Dataset B tasks involve identifying specific accessory compatibility (Apple Pencil generations)",
        "Dataset B focuses on enumerating physical product characteristics (colors, materials) rather than technical specifications",
        "Dataset B requires direct extraction of marketing slogans/claims rather than general promotional content understanding",
        "Dataset B tasks emphasize password recovery workflows rather than general account management procedures"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Dataset B tasks require identification of precise physical measurements (weight, dimensions) not just general specifications",
        "Dataset B includes tasks demanding enumeration of supported built-in apps/software for specific devices",
        "Dataset B requires explicit counting of available product variants/types (e.g. AirPods models, keyboard types)",
        "Dataset B tasks involve checking compatibility with older device generations (3+ years) rather than immediate predecessors",
        "Dataset B contains tasks requiring zip code-based accessory availability checks beyond general product inventory",
        "Dataset B tasks demand step-by-step upgrade cost calculations from base to maximum configurations",
        "Dataset B requires identification of specific component features (e.g. Siri Remote capabilities)",
        "Dataset B includes tasks focused on exact release timelines rather than general availability windows",
        "Dataset B tasks require direct comparison of chipset differences between non-consecutive models",
        "Dataset B contains tasks needing explicit identification of wireless charging capabilities for accessories"
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Dataset B tasks focus on retrieving singular, exact data points (e.g., statistics, dates, names), while Dataset A emphasizes multi-result comparisons or synthesis",
        "B tasks require accessing third-party platforms (e.g., YouTube, Reddit, GitHub) for data extraction, while A tasks primarily use search engines or aggregated results",
        "B tasks demand parsing platform-specific structured data (e.g., comment sections, commit histories), whereas A tasks involve broader web navigation across generic sites",
        "B tasks frequently involve real-time or dynamically updated information (e.g., live scores, current charts), while A focuses on recent but stable data (e.g., news articles)",
        "B tasks necessitate precise query specifications (e.g., exact titles, journal names), while A tasks allow exploratory searches with iterative refinement",
        "B tasks focus on technical/quantitative outputs (e.g., SHA hashes, member counts), whereas A tasks prioritize qualitative analysis (e.g., job descriptions, symptom comparisons)",
        "B tasks require understanding temporal immediacy constraints (e.g., \"as of today\"), while A tasks use relative timeframes (e.g., \"recent\")",
        "B tasks involve user-generated content platforms (e.g., social media comments), while A tasks focus on institutional sources (e.g., health organizations, job boards)",
        "B tasks target platform-native features (e.g., Spotify charts, GitHub commits), whereas A tasks use conventional web interfaces (e.g., product pages, news sites)",
        "B tasks prioritize atomic factual retrieval (e.g., elevation values), while A tasks often require contextual interpretation (e.g., flu risk factors)"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Dataset B tasks focus more on precise numerical/statistical data retrieval (elevations, dates, counts)",
        "Dataset B contains explicit credential-based interaction tasks (login attempts with specific credentials)",
        "Dataset B emphasizes real-time/live data verification (current game scores, air quality indices)",
        "Dataset B requires extraction of platform-specific metadata (comment stats, commit hashes, chart rankings)",
        "Dataset B tasks frequently involve third-party platform interactions (Twitter, Spotify, GitHub, Reddit)",
        "Dataset A contains more educational/course-related objectives (language learning, university programs)",
        "Dataset A includes tasks requiring content creation/modification (recipe databases, citation formatting)",
        "Dataset A shows stronger emphasis on health/medical information synthesis and analysis",
        "Dataset A contains more complex transactional goals (event bookings, purchases, job applications)",
        "Dataset B prioritizes temporal precision constraints ('as of today's date', 'latest...as of now')"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Dataset B tasks emphasize immediate retrieval of exact numerical/statistical values (e.g., elevation measurements, pollution statistics) while A focuses on exploratory understanding of concepts/trends",
        "Dataset B requires direct verification of authentication outcomes (e.g., login success/failure confirmation) while A focuses on form completion processes without explicit outcome checks",
        "Dataset B contains tasks requiring extraction of precise metadata from specific multimedia content (e.g., YouTube comment details) while A focuses on general multimedia discovery",
        "Dataset B tasks demand real-time validation of dynamic data (e.g., current air quality index, live game scores) while A includes historical/archival data retrieval",
        "Dataset B emphasizes parsing platform-specific community metrics (e.g., Reddit member counts, GitHub commit details) while A focuses on professional/academic content analysis",
        "Dataset B requires identification of ordinal rankings/positions (e.g., top 10 charts, #1 artist) while A focuses on comparative analysis between entities",
        "Dataset B tasks involve explicit temporal constraints (\"as of today's date\", \"latest\") for currency verification while A uses relative time references",
        "Dataset B contains tasks requiring extraction of technical specifications from software/hardware systems while A focuses on program/career requirements",
        "Dataset B emphasizes direct citation of source materials (e.g., journal names, blog references) in responses while A focuses on cross-referencing without explicit sourcing",
        "Dataset B includes tasks requiring atomic data unit extraction (e.g., first 7 bits of SHA hash) while A focuses on holistic information synthesis"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "Dataset B tasks require precise numerical or exact factual answers (e.g., elevation, SHA commit bits, goal counts).",
        "Dataset B tasks involve direct verification of user credentials or login success (e.g., testing Twitter login).",
        "Dataset B tasks focus on retrieving platform-specific metrics (e.g., Reddit member counts, Spotify song rankings).",
        "Dataset B tasks demand real-time or dynamically updated data validation (e.g., today's air quality, live NBA scores).",
        "Dataset B tasks target granular technical specifications (e.g., iPhone software requirements for AirDrop functionality).",
        "Dataset B tasks require extracting metadata from third-party platforms (e.g., YouTube comment authorship, GitHub commit details).",
        "Dataset B tasks emphasize parsing ranked/top-N lists (e.g., top 10 songs, trending destinations).",
        "Dataset B tasks include explicit time-bound constraints (e.g., \"as of today's date\", \"latest commit\").",
        "Dataset B tasks involve community-driven content validation (e.g., Reddit community stats, trending blog rankings).",
        "Dataset B tasks require cross-referencing temporal data (e.g., \"year before last\", \"from 2020\u20132021\")."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Tasks in B require precise numerical answers (e.g., elevation, dates, counts) rather than qualitative or descriptive information retrieval",
        "B includes explicit credential-based interactions (e.g., login attempts) as primary task objectives",
        "B tasks demand extraction of platform-specific metrics (e.g., social media engagement stats, commit hashes)",
        "B requires parsing user-generated content (e.g., YouTube comments, Reddit posts) as core task requirements",
        "B emphasizes real-time environmental/geospatial data retrieval (e.g., air quality, planetary distances)",
        "B tasks frequently require multi-platform data aggregation (e.g., GitHub + Spotify + Reddit in single query)",
        "B focuses on technical system specifications (e.g., hardware requirements, software version compatibility)",
        "B includes explicit ranking/position extraction (e.g., 'top-10', 'number one artist') as core task objectives",
        "B tasks require temporal precision constraints (e.g., 'as of today's date', 'latest') in all temporal queries",
        "B demands structured data extraction from academic/professional databases (e.g., journal papers, code repositories)"
      ]
    }
  }
}