{
  "sims": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Tasks require searching for specific locations (e.g., cities, landmarks, businesses) using search functionality",
        "Users need to filter results by type (e.g., restaurants, hotels, parking, transit)",
        "Navigation tasks involve route planning between multiple points (origin to destination)",
        "Tasks require checking and comparing user ratings/reviews of locations",
        "Users need to verify operational details (opening hours, availability status)",
        "Tasks involve finding location-specific amenities (parking, accessibility features)",
        "Queries require proximity-based searches (near me, within distance limits)",
        "Tasks demand multi-step interactions (search -> filter -> compare -> select)",
        "Users need to access transportation mode options (driving, transit, walking, biking)",
        "Tasks require extracting detailed place information (pricing, services, menu items)"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Tasks require searching for specific geographic points of interest (e.g., hotels, restaurants, parks, transit hubs).",
        "Queries involve retrieving detailed location attributes (e.g., ratings, operating hours, accessibility features, parking availability).",
        "Navigation tasks frequently include route planning between two or more geographic coordinates.",
        "Users must filter results using constraints like price ranges, star ratings, proximity thresholds, or real-time status (e.g., open now).",
        "Tasks often require extracting structured data from unstructured user reviews or comments.",
        "Queries emphasize accessibility requirements (e.g., wheelchair-friendly amenities, EV charging stations, parking lot availability).",
        "Instructions target transactional actions (e.g., booking reservations, printing route details, checking schedules).",
        "Tasks rely on geographic specificity (e.g., zip codes, neighborhoods, landmarks, intersections) for precision.",
        "Users compare multiple options to meet dynamic criteria (e.g., highest-rated, least crowded, closest distance).",
        "Tasks blend commercial services (e.g., hotels, stores) and public infrastructure (e.g., trails, transit stops, museums)."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Tasks involve searching for specific locations or points of interest (e.g., hotels, restaurants, parks) using geographic or contextual filters.",
        "Users frequently request route planning (e.g., driving, walking, transit) between two or more locations with details like accessibility or traffic.",
        "Tasks require filtering results by criteria such as ratings (e.g., '4.5 stars or higher'), price range, or amenities (e.g., parking, pools).",
        "Queries often include proximity constraints (e.g., 'within 2 miles', 'near a landmark') for location-based searches.",
        "Users seek operational or logistical details (e.g., hours of operation, admission prices, parking availability, transit schedules).",
        "Tasks involve evaluating user reviews or comments to inform decisions (e.g., 'check user comments about parking lots').",
        "Accessibility requirements (e.g., wheelchair-accessible routes, amenities) are explicitly mentioned in multiple tasks.",
        "Users aim to compare options (e.g., hotels, restaurants) based on specific attributes like price, ratings, or guest capacity.",
        "Tasks include time-sensitive constraints (e.g., 'January 11th availability', 'open now') for bookings or visits.",
        "Actions like making reservations, booking services, or printing/saving route details are common objectives."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Both datasets involve tasks requiring search for specific locations (e.g., cities, landmarks, trails) using geographic or address-based criteria.",
        "Tasks in both datasets require filtering results by accessibility features (e.g., wheelchair access, parking availability).",
        "Both include route planning tasks involving multiple transportation modes (driving, public transit, walking).",
        "Tasks frequently require extracting detailed operational information (hours, pricing, availability of amenities/services).",
        "Both datasets involve comparative analysis (e.g., price comparisons, rating thresholds like 4.5+ stars).",
        "Tasks demand interaction with user-generated content (reviews, comments, ratings) for decision-making.",
        "Both require validating real-time or location-specific data (traffic conditions, open/closed status).",
        "Tasks involve multi-step navigation (e.g., find location \u2192 check reviews \u2192 verify accessibility \u2192 plan route).",
        "Both datasets include tasks requiring proximity-based searches (e.g., \"near X landmark,\" \"within Y miles\").",
        "Tasks in both datasets emphasize structured data extraction (e.g., levels of a building, specific amenity lists)."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Both datasets focus on location-based searches (e.g., cities, landmarks, businesses).",
        "Tasks involve querying specific amenities (e.g., parking, restaurants, transit).",
        "Users frequently seek directions/routes between points (e.g., driving, biking, transit).",
        "Both require filtering results by criteria like ratings (>4.8 stars), price range, or accessibility.",
        "Tasks include time-sensitive queries (e.g., 'currently open', '24-hour parking').",
        "Both emphasize accessibility features (e.g., wheelchair-accessible restaurants, parking).",
        "Users request operational details like hours, availability, and service status.",
        "Tasks involve multi-step planning (e.g., itineraries with stops, combined transit modes).",
        "Both require parsing business/service details (e.g., menus, reviews, amenities).",
        "Users utilize geographic parameters (e.g., zip codes, 'near me', proximity to landmarks)."
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks involve searching for repositories with specific criteria (e.g., stars, language, recency)",
        "Tasks require comparing pricing plans or feature tiers (e.g., Copilot, Enterprise vs. Team)",
        "Tasks include navigating security-related content (e.g., vulnerabilities, advisories, Dependabot)",
        "Tasks involve interacting with GitHub Copilot features (e.g., pricing, FAQs, use cases)",
        "Tasks require locating customer success stories or case studies",
        "Tasks focus on educational resources (e.g., GitHub Skills, student packs, onboarding)",
        "Tasks involve project/repository management (e.g., issues, contributors, commit history)",
        "Tasks require account creation/sign-up processes or policy review (e.g., privacy, terms)",
        "Tasks involve analyzing repository metrics (e.g., forks, stars, contributors, activity)",
        "Tasks include workflow automation features (e.g., GitHub Actions, CI/CD pipelines)"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Tasks require locating repository-specific information like stars, forks, and programming languages",
        "Tasks involve navigating to pricing/plan comparison sections (e.g., Enterprise vs Team plans)",
        "Tasks require finding documentation for specific features (Copilot, Advanced Security, GraphQL API)",
        "Tasks involve searching for customer success stories across different industries",
        "Tasks require accessing educational resources (GitHub Skills courses, documentation guides)",
        "Tasks involve comparing product capabilities across different tiers (Free vs Paid features)",
        "Tasks require filtering repositories by recency criteria (last updated/created dates)",
        "Tasks involve locating security-related information (advisories, Dependabot, secret scanning)",
        "Tasks require identifying contributor statistics (top contributors, commit history)",
        "Tasks involve finding official partnership announcements or product update information"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Tasks involve navigating GitHub's product pages to locate specific features or documentation (e.g., Copilot, Advanced Security, CLI)",
        "Both require searching/filtering repositories by criteria like stars, forks, programming language, or recency",
        "Tasks involve comparing pricing tiers (e.g., Copilot plans, Enterprise vs Team storage)",
        "Navigation includes accessing security-related content (CodeQL, Dependabot, secret scanning)",
        "Tasks require interacting with GitHub's educational resources (Skills courses, Classroom, documentation)",
        "Both datasets include tasks involving customer success stories and use cases",
        "Tasks require understanding GitHub's project management features (Issues, Projects, workflow runs)",
        "Navigation paths involve account management flows (signup, privacy settings, enterprise trials)",
        "Both require locating developer tools (GitHub Actions, Codespaces, Mobile app features)",
        "Tasks involve parsing technical documentation (API usage, Markdown formatting, GraphQL implementation)"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Tasks focus on locating GitHub Copilot-related information including features, pricing, and usage policies.",
        "Queries involve comparing GitHub's pricing tiers and plan features (e.g., Free vs. Pro vs. Enterprise).",
        "Tasks require locating specific repository attributes like stars, forks, or recent commits.",
        "Navigation goals include finding security-related content such as CVEs, vulnerabilities, or security tools.",
        "Users seek customer success stories or case studies displayed on GitHub's marketing pages.",
        "Tasks involve identifying GitHub's project management features like Issues, Projects, or Milestones.",
        "Queries target GitHub's educational resources including Skills courses or Classroom functionalities.",
        "Tasks require filtering repositories by programming language, update date, or creation date.",
        "Navigation includes exploring GitHub's enterprise solutions, trials, or server offerings.",
        "Users aim to retrieve technical documentation for APIs, integrations, or development tools."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Tasks require searching for repositories using specific criteria (language, topic, stars, recency)",
        "Tasks involve comparing pricing plans (e.g., Free vs Pro vs Enterprise)",
        "Tasks require navigating documentation/help sections (e.g., creating projects, security guides)",
        "Tasks involve identifying repository metrics (stars, forks, contributors, recent activity)",
        "Tasks require locating customer stories/testimonials from specific companies",
        "Tasks involve finding feature details (Copilot capabilities, Advanced Security tools)",
        "Tasks require parsing security information (vulnerabilities, Dependabot, secret scanning)",
        "Tasks involve exploring GitHub product sections (Actions, Projects, Mobile)",
        "Tasks require interacting with API documentation (REST vs GraphQL comparisons)",
        "Tasks involve account/subscription management (plan upgrades, storage limits)"
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Both datasets focus on retrieving real-time or recent sports scores and game details such as team matchups, quarter/period status, and final results.",
        "Tasks in both datasets require navigating through structured league sections (NBA, NFL, NCAA, etc.) with consistent team abbreviations and score formatting.",
        "Users must locate player/team statistics across multiple sports (e.g., points scored, win-loss records, standings) through standardized data presentation.",
        "Both involve finding schedule information including game dates/times, TV networks, and timezone-specific start times (PT/UTC).",
        "Navigation requires identifying active/clickable elements for gamecasts, box scores, and play-by-play updates across both datasets.",
        "Tasks demand comparison of team records (e.g., 30-5 vs. 31-4) and seasonal performance metrics within conference standings.",
        "Both datasets require parsing game status indicators like 'Final', 'Postponed', or in-progress clock updates (e.g., '7:08 - 2nd').",
        "Users must navigate betting information integration including spreads, moneylines, and over/under totals in both interfaces.",
        "Both involve locating multimedia content links (watch/streaming options, ESPN+ integration) alongside game data.",
        "Tasks require handling dynamic content updates for live games and time-sensitive information across multiple concurrent matchups."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Tasks require navigating to live scores and game results across multiple sports leagues (e.g., NBA, NFL, NCAA).",
        "Users must locate detailed player/team statistics (e.g., rebounds, points, standings) through structured data tables.",
        "Navigation involves accessing time-sensitive content like recent trades, injuries, or game schedules within 2-7 day timeframes.",
        "Tasks require interaction with sports news articles discussing analysis, predictions, or historical context (e.g., MVP candidates, Final Four previews).",
        "Users must identify/compare team rankings across conferences using conference-specific standings pages.",
        "Navigation paths involve filtering scores/games by date (e.g., 'yesterday's matchups', 'December 25, 2023').",
        "Tasks require handling dynamic content updates for ongoing games (e.g., quarter/half-time scores, live odds).",
        "Users must cross-reference multimedia content (podcasts, videos) with written stats/articles for comprehensive insights.",
        "Navigation includes fantasy sports integration (e.g., Tournament Challenge brackets, fantasy player stats).",
        "Tasks involve error recovery patterns for missing/unavailable data (e.g., handling final scores for in-progress games)."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Tasks involve retrieving real-time or updated scores and game results across multiple sports leagues.",
        "Navigation requires accessing specific sections like standings, brackets, and player/team stats.",
        "Users must recognize and use common team abbreviations (e.g., LAC, NE, NYK) to locate information.",
        "Tasks frequently demand temporal filtering (e.g., dates, seasons, recent transactions) for accuracy.",
        "Both datasets emphasize cross-league coverage (NFL, NBA, NCAA, soccer) within a single platform.",
        "Queries often involve comparative analysis (e.g., player stats, team standings, odds comparisons).",
        "Integration with ESPN+ features or content is required for certain tasks (e.g., streaming, tools).",
        "Users navigate hierarchical structures (e.g., homepage \u2192 league \u2192 game \u2192 box score \u2192 player stats).",
        "Tasks necessitate understanding playoff formats, brackets, and postseason tracking (e.g., CFP, NBA Play-In).",
        "Search and filter functionalities are critical for locating time-bound articles, schedules, or odds."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Tasks involve retrieving real-time or recent sports scores and game results across leagues like NBA, NFL, NCAA, and NHL.",
        "Users must locate player/team statistics (e.g., points, standings, historical performance) from structured data modules.",
        "Navigation requires accessing league-specific sections (e.g., standings, schedules, brackets) via menus or search.",
        "Both datasets emphasize finding news articles/updates (e.g., trades, MVP candidates, injury reports) within ESPN\u2019s content ecosystem.",
        "Tasks involve comparing data across teams/players (e.g., stats, rosters, standings rankings).",
        "Users interact with multimedia/platform-specific features (e.g., ESPN+ streaming, podcasts, radio).",
        "Queries require parsing event details (e.g., game time, network broadcast info, score breakdowns).",
        "Tasks demand navigation through postseason/playoff-related content (e.g., Final Four, CFP, NFL playoffs).",
        "Users must filter/search for time-bound information (e.g., \u201cpast 2 days\u201d, \u201clatest\u201d, \u201c2024 season\u201d).",
        "Both datasets include cross-sport navigation (e.g., transitioning between college football, NBA, NHL content)."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Both datasets focus on retrieving real-time sports scores, schedules, and game summaries.",
        "Tasks require navigating to team-specific pages (e.g., Lakers, Bills, Celtics) for detailed information.",
        "Users must access player statistics, including performance metrics and injury reports.",
        "Both involve checking league standings (NBA, NFL, NCAA) and conference rankings.",
        "Tasks include filtering results by specific dates, weeks, or seasons (e.g., 'latest NBA trades in the past 2 days').",
        "Dynamic content interaction (live updates, playoffs, drafts) is central to navigation goals.",
        "Search functionality is critical for locating specific games, players, or news articles.",
        "Postseason information (playoff brackets, draft results, Final Four) is a common query target.",
        "Users compare team/player metrics across games (e.g., scores, standings, stats).",
        "Multi-step navigation through menus (e.g., 'Where to Watch', ESPN+ sections) is required for complex tasks."
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Tasks require searching/filtering models/datasets by specific attributes (license, modality, update date)",
        "Tasks involve extracting metadata like model size, framework, download counts, or GitHub stars",
        "Tasks require navigating documentation for API usage, training parameters, or library integration",
        "Tasks involve comparing enterprise vs. free tier features and pricing plans",
        "Tasks require identifying temporal information (latest updated models, recent papers/blog posts)",
        "Tasks involve license verification (Apache-2.0, CC-BY-SA-4.0, commercial use permissions)",
        "Tasks require interacting with community features (model likes, dataset followers, discussion forums)",
        "Tasks involve modality-specific searches (text-to-image, audio processing, video generation)",
        "Tasks require cross-referencing between models, datasets, papers, and GitHub repositories",
        "Tasks involve technical implementation details (model conversion, GPU requirements, inference optimization)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Tasks require navigating structured categories (Models, Datasets, Spaces)",
        "Users must locate and interpret model metadata (update dates, likes, downloads)",
        "License information retrieval is a common requirement",
        "Need to compare performance metrics across models/datasets",
        "Documentation search for specific technical implementations",
        "Pricing plan analysis for different account tiers",
        "Modality-specific model identification (text, image, audio)",
        "Enterprise feature exploration (security, support, SSO)",
        "Dataset content verification and format requirements",
        "API usage instructions for model implementation"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Tasks require retrieving technical specifications of models (name, size, framework, architecture)",
        "Navigation involves filtering/searching models by license type (Apache-2.0, CC-BY-SA-4.0)",
        "Tasks require comparing models based on quantitative metrics (download counts, likes, stars)",
        "Users need to locate temporal information (latest updates, model version dates)",
        "Tasks involve cross-referencing documentation for API usage (Trainer API, Inference API)",
        "Requires understanding of model deployment details (TensorFlow/PyTorch conversion, GPU requirements)",
        "Tasks demand analysis of organizational content (enterprise features, pricing tiers)",
        "Navigation patterns include dataset exploration (metadata, content previews, update history)",
        "Tasks require identifying specialized model capabilities (text-to-image, translation, NER)",
        "Users must verify compatibility information (language support, modality requirements)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Tasks require searching/filtering models by attributes (license, task, modality)",
        "Tasks involve retrieving technical specifications (model size, framework, tensor type)",
        "Tasks require extracting temporal information (last updated date, release date)",
        "Tasks involve comparing/ranking models by metrics (likes, downloads, stars)",
        "Tasks require navigation between related resources (models \u2194 datasets \u2194 documentation)",
        "Tasks involve license verification and compliance checks (Apache-2.0, CC-BY-SA)",
        "Tasks require understanding pricing tiers and enterprise features (Pro account, GPU costs)",
        "Tasks involve cross-referencing model cards with research papers/technical documentation",
        "Tasks require interpretation of model capabilities from metadata (supported modalities, languages)",
        "Tasks involve API interaction scenarios (Inference API, Trainer API, Transformers.js)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Tasks require searching/filtering models/datasets by attributes like license type, update date, or task type",
        "Navigation involves extracting model metadata (name, size, framework, download counts)",
        "Users need to locate documentation for API usage (Trainer API, Inference API, model conversion)",
        "Tasks involve comparing/identifying models with highest metrics (likes, downloads, stars)",
        "Requires navigating pricing/enterprise sections to compare plans and features",
        "Tasks demand finding latest updated models/datasets through timestamps",
        "Users must parse technical specifications from model/dataset cards (architecture, use cases)",
        "Navigation paths include exploring open-source libraries (Transformers, Diffusers, PEFT)",
        "Tasks require cross-referencing model capabilities with documentation examples",
        "Involves filtering content by modality (text, image, audio) and technical requirements"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Tasks involve searching for courses by subject area (e.g., Data Science, Python, Project Management)",
        "Navigation requires filtering by skill level (beginner/intermediate)",
        "Users need to identify course providers/institutions (e.g., Google, IBM, Stanford)",
        "Tasks require checking course duration and time commitment estimates",
        "Both datasets include certificate verification requirements for course completion",
        "Users must locate instructor information and associated credentials",
        "Tasks involve comparing career outcomes (salary ranges, job availability)",
        "Navigation includes exploring degree programs and their admission requirements",
        "Both require identifying free vs paid course options",
        "Tasks demand checking course ratings and review statistics"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Tasks require searching/filtering courses by specific subjects (e.g., Data Science, Python, Cybersecurity).",
        "Users need to identify course levels (beginner/intermediate) and prerequisite requirements.",
        "Tasks involve extracting instructor details including names, bios, and other courses taught.",
        "Queries demand verification of certification availability upon course completion.",
        "Navigation requires checking course/provider partnerships (universities like Yale, companies like Google).",
        "Tasks require comparing course durations and time commitments (e.g., hours/week, total weeks).",
        "Users must locate skill-based course components (e.g., Agile methodology, AI ethics modules).",
        "Queries involve validating ratings/reviews (e.g., minimum 4.5 stars, participant feedback).",
        "Tasks require identifying career-aligned credentials (e.g., Professional Certificates, Specializations).",
        "Navigation includes accessing external resources (e.g., learner outcome reports, university deadlines)."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Tasks involve searching for courses by specific topics (e.g., AI, Python, Sustainability) with granular filters like skill level, duration, or subject area.",
        "Users frequently seek instructor details, including names, bios, and other courses taught by the same instructor.",
        "Tasks require identifying course structure elements such as modules (e.g., 'Agile methodology'), video counts, or learning outcomes.",
        "Queries target certifications or credentials (e.g., Professional Certificates, Specializations) and their requirements or benefits.",
        "Users prioritize course ratings (e.g., 4.5+ stars) and reviews as decision-making criteria.",
        "Tasks involve filtering courses by partner institutions (e.g., Google, IBM, Stanford) or regional affiliations (e.g., Australian universities).",
        "Navigation includes verifying free vs. paid offerings, trial access, or enrollment prerequisites.",
        "Users compare career-related metrics like median salaries, job availability, or skill applicability for roles (e.g., Data Analyst, Cybersecurity Analyst).",
        "Queries focus on degree programs (e.g., Master\u2019s deadlines, credit policies) and their alignment with career goals.",
        "Tasks require extracting metadata such as course durations (e.g., 1-3 months) or weekly time commitments (e.g., 5 hours/week)."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Tasks involve searching for courses by specific technical skills (e.g., Python, data analytics, cybersecurity).",
        "Users frequently filter courses by difficulty level (e.g., beginner, intermediate).",
        "Queries require identifying course attributes like duration, weekly time commitment, and completion certificates.",
        "Tasks often include verifying institutional affiliations (e.g., Google, IBM, Stanford) of courses.",
        "Navigation requires extracting instructor details, including biographies and other courses taught.",
        "Users prioritize courses with high ratings (e.g., 4.5+ stars) and reviews.",
        "Tasks involve comparing Professional Certificates vs. Specializations vs. Degree programs.",
        "Queries focus on career-aligned skills (e.g., job-ready certifications, salary data, role-specific training).",
        "Users frequently seek free courses or financial promotions (e.g., Coursera Plus discounts).",
        "Tasks require filtering by subject categories (e.g., Business, Computer Science, Health)."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Tasks require filtering courses by skill level (beginner/intermediate)",
        "Both involve searching for Professional Certificates/Specializations from recognized institutions",
        "Navigation includes identifying partner universities and companies (Google, IBM, universities)",
        "Tasks require checking course credentials and certification availability",
        "Both datasets involve comparing program durations and weekly time commitments",
        "Users must locate specific course modules/curriculum components (e.g., Agile, IoT)",
        "Tasks require extracting instructor credentials and related course offerings",
        "Both involve sorting/filtering by course ratings (4+ stars) and review counts",
        "Navigation includes identifying degree programs and admission requirements",
        "Tasks require differentiating between free courses vs. paid certificates"
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Both datasets require search functionality with field-specific filters (title, author, abstract, etc.)",
        "Tasks involve navigating hierarchical subject categories with subcategory drill-down capabilities",
        "Users need to access recent paper listings within specific date ranges in both datasets",
        "Both require understanding of arXiv's category-specific identifiers (e.g., astro-ph.EP, quant-ph)",
        "Tasks involve extracting specific metadata elements (author names, affiliations, submission dates)",
        "Both datasets require interaction with advanced search parameters and Boolean operators",
        "Users need to distinguish between different publication formats (HTML vs PDF) in both datasets",
        "Tasks require cross-referencing between paper abstracts and detailed content sections",
        "Both involve handling special search syntax for journal references and arXiv identifiers",
        "Users must navigate between paper metadata and institutional/organizational author affiliations"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Both datasets focus on academic paper search and retrieval tasks across multiple scientific disciplines",
        "Tasks require navigation through hierarchical subject categories (e.g., Physics -> Condensed Matter -> Mesoscale Physics)",
        "Users need to utilize advanced search filters including date ranges, categories, and author names",
        "Tasks involve extracting specific paper components (abstracts, author affiliations, reference sections)",
        "Both require understanding of arXiv's categorization system and subfield identifiers (e.g., cs.LG for machine learning)",
        "Tasks demand handling of paper versions and formats (PDF, HTML, source code downloads)",
        "Users must navigate supplemental materials and related paper recommendations",
        "Both involve working with technical domain-specific terminology across STEM fields",
        "Tasks require cross-referencing between search results and paper metadata",
        "Users need to interpret academic citation formats and arXiv identifiers"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Both datasets require users to perform academic paper searches using specific keywords and categories",
        "Tasks involve filtering results by date ranges (e.g., last week, within last two days)",
        "Users need to navigate through hierarchical subject categories (e.g., Physics > Astrophysics)",
        "Both require extraction of specific metadata elements (titles, authors, abstracts)",
        "Tasks involve comparing search results across different categories/archives",
        "Users must locate and interpret submission guidelines/policies (e.g., multi-language requirements)",
        "Both datasets require handling paper versions/formats (HTML vs PDF access)",
        "Tasks involve affiliation tracking and author identification features",
        "Users need to utilize advanced search operators (journal references, date ranges)",
        "Both require navigation through help documentation and support resources"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Search functionality includes keyword-based queries across multiple fields (title, abstract, comments)",
        "Requires navigation through hierarchical subject categories and subcategories (e.g., Physics \u2192 Condensed Matter)",
        "Tasks involve date filtering for recent submissions (last week, last two days)",
        "Need to handle special search syntax for journal references and identifiers (arXiv ID, DOI)",
        "Requires understanding of multi-format document access (HTML, PDF, source downloads)",
        "Contains queries about author affiliations and collaborative institutions",
        "Involves cross-category searching (e.g., machine learning in both Computer Science and Statistics)",
        "Requires interpretation of paper metadata (author lists, submission dates, version history)",
        "Tasks demand understanding of arXiv's subject classification schema granularity",
        "Includes operational queries about submission requirements and policy compliance"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Tasks require searching scholarly articles by specific keywords or phrases within academic categories",
        "Navigation involves filtering results by subject categories and subcategories (e.g., Physics \u2192 Astrophysics)",
        "Queries require understanding of hierarchical academic taxonomies (archive \u2192 category \u2192 subcategory)",
        "Tasks involve temporal filtering (e.g., 'last week', 'last two days', date ranges)",
        "Users need to extract metadata: titles, authors, abstracts, and publication dates from results",
        "Actions require distinguishing between multiple search scopes (category-specific vs. cross-archive searches)",
        "Tasks demand handling author name variations and affiliation identification",
        "Queries involve format-specific retrieval (PDF, HTML, TeX source)",
        "Navigation requires access to specialized filters: journal references, arXiv IDs, and publication status",
        "Tasks involve cross-referencing between papers through citations and bibliographies"
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Both datasets require navigating hierarchical category structures (e.g., World > Middle East, Sport > Football)",
        "Tasks involve locating time-sensitive content marked with relative timestamps (e.g., 'X hrs ago')",
        "Users must identify and parse article components like headlines, summaries, and metadata tags",
        "Navigation requires interaction with standardized section headers (Business, Culture, Asia, etc.)",
        "Tasks demand differentiation between news formats (articles vs videos vs live updates)",
        "Both involve filtering content through geographical tags (UK, US & Canada, Middle East)",
        "Users must handle content organization patterns like 'MOST READ' and 'MORE TO EXPLORE' sections",
        "Tasks require parsing multi-part content cards containing images, headlines, and excerpts",
        "Both datasets involve cross-referencing category labels with timestamps for recency verification",
        "Navigation patterns require understanding persistent header elements (menu buttons, search, account links)"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Articles are organized into region-specific sections (e.g., Asia, Middle East, Europe).",
        "Timestamps indicating time elapsed since publication are consistently displayed.",
        "Multimedia elements (images, videos) are embedded within article previews.",
        "Primary navigation includes categorized sections like News, Sport, Business, and Culture.",
        "Search functionality is implied for locating specific topics or articles.",
        "Live updates or live coverage sections are available for real-time events.",
        "Dynamic content blocks (e.g., 'Most Read,' 'Most Watched') highlight trending articles.",
        "Articles are tagged with thematic categories (e.g., Climate, Innovation, US & Canada).",
        "Metadata such as geographic relevance and topic tags accompany article summaries.",
        "Structured article previews include headlines, brief summaries, and contextual hyperlinks."
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Both datasets require navigation through categorized sections (e.g., World News, Business, Sport, Middle East) to locate articles.",
        "Tasks involve identifying timestamps (e.g., '3 hrs ago') to determine content recency.",
        "Users must summarize key points from structured articles with headings, paragraphs, and multimedia elements.",
        "Regional subsections (e.g., Asia, Europe, US & Canada) are critical for localized news retrieval.",
        "Live updates (e.g., 'LIVE' coverage) are featured for real-time event tracking.",
        "Multimedia content (images, videos) is integral to article comprehension and task completion.",
        "Both include hierarchical topic categorization (e.g., 'Gaza ceasefire', 'Trump tariffs') for thematic browsing.",
        "Time-sensitive tasks rely on consistent article metadata (category labels, publication dates).",
        "Lists like 'Most Read' or 'Most Watched' streamline access to trending content.",
        "Navigation tasks depend on structured menus (e.g., header links, footer navigation) for section access."
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Both datasets require navigating through categorized news sections (e.g., World, US & Canada, Asia, Business) to locate specific articles or summaries.",
        "Tasks in both datasets involve extracting time-sensitive information (e.g., publication timestamps like '3 hrs ago' or '1 day ago') from article metadata.",
        "Both require identifying and interacting with multimedia content (images/videos) within news stories through descriptive captions or embedded links.",
        "Navigation tasks in both datasets depend on hierarchical menu structures with primary categories (News, Sport, Culture) and subcategories (regional divisions, specific topics).",
        "Both involve locating articles through section-specific filtering (e.g., finding Technology news within Business section or regional news in Middle East/Asia categories).",
        "Tasks require parsing article previews containing headlines, brief summaries, and category tags to identify relevant content.",
        "Both datasets feature trending content sections ('MOST READ', 'MOST WATCHED') that users must navigate to find popular stories.",
        "Navigation involves using consistent timestamp patterns (e.g., 'X hrs ago') to identify recency of news across both datasets.",
        "Both require distinguishing between live updates (LIVE labels) and standard articles through visual/textual cues in content previews.",
        "Tasks in both datasets involve footer navigation to access ancillary content (Terms of Use, Privacy Policy) through standardized link structures."
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Tasks require locating and summarizing recent articles on specific topics (e.g., natural disasters, tech layoffs, sports events).",
        "Navigation involves filtering content by geographic regions (e.g., Asia, Middle East, Europe, US & Canada).",
        "Users must identify timestamps (e.g., 'hrs ago', publication dates) to verify content recency.",
        "Interaction with multimedia elements like images/videos in stories is required (e.g., 'first picture', 'PodCasts').",
        "Tasks demand browsing hierarchical categories (e.g., Sport \u2192 Football \u2192 Scottish Premiership, Business \u2192 Technology).",
        "Users need to cross-reference multiple sections (e.g., linking Weather reports to regional news).",
        "Tasks involve extracting structured data from articles (e.g., leaderboard rankings, event start times).",
        "Navigation requires distinguishing between news formats (e.g., live updates, in-depth analyses, video reports).",
        "Users must parse article metadata (e.g., author names, content tags like 'BBC InDepth').",
        "Tasks emphasize verifying context through related links (e.g., 'OTHER TOP STORIES', 'MOST READ' sections)."
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Tasks require keyword-based search queries with product-specific attributes (e.g., price ranges, material specifications)",
        "Navigation involves filtering results by price constraints across both datasets",
        "Both require comparison of multiple products based on specified criteria",
        "Users need to verify customer review thresholds (4+ stars) in both datasets",
        "Tasks involve checking availability of free shipping options",
        "Both require interaction with category-specific filters (e.g., electronics, home goods)",
        "Price sorting functionality (low-high/high-low) is essential in both datasets",
        "Product availability checks based on size/color variations appear in both",
        "Both involve locating time-sensitive deals/offers with percentage discounts",
        "Tasks require understanding of product specifications from descriptions"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Tasks involve searching and filtering products by specific attributes (e.g., price range, category, material).",
        "Users frequently utilize price constraints (e.g., 'under $50', 'cheapest') to narrow results.",
        "Navigation requires interaction with product categories (e.g., electronics, home goods, fashion).",
        "Tasks include adding items to cart or verifying cart actions as a key step.",
        "Sorting mechanisms (e.g., price high-to-low, customer ratings) are critical for task completion.",
        "Product specifications (e.g., dimensions, battery life, capacity) are central to decision-making.",
        "Free shipping availability and delivery options are common user requirements.",
        "Customer review thresholds (e.g., '4+ stars') are used to filter quality products.",
        "Deal identification and comparison (e.g., percentage discounts, limited-time offers) are recurring objectives.",
        "Attribute-based filtering (e.g., size, color, waterproofing) is essential across multiple product types."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Tasks require product search with specific filters (price, category, ratings).",
        "Users need to interact with shopping cart features (add items, check availability).",
        "Navigation involves comparing prices across multiple product listings.",
        "Tasks include sorting products by attributes like price, popularity, or release date.",
        "Users must verify product details (material, size, compatibility, features).",
        "Tasks require filtering results by delivery options (FREE shipping, Prime).",
        "Navigation involves accessing and interpreting customer reviews/ratings.",
        "Users need to browse hierarchical categories (e.g., Electronics > Accessories > Gaming).",
        "Tasks require identification of promotional deals or seasonal offers.",
        "Navigation includes handling multi-step workflows (search > filter > compare > select)."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Tasks require filtering products by price range (e.g., under $50, between $40-$60).",
        "Navigation includes category-specific searches (e.g., electronics, home decor, fashion).",
        "Users must validate product attributes (e.g., waterproof, RFID blocking, eco-friendly).",
        "Tasks involve sorting results (e.g., price low-to-high, customer ratings).",
        "Actions require checking customer review thresholds (e.g., 4+ stars, 500+ reviews).",
        "Users verify availability constraints (e.g., free shipping, delivery eligibility).",
        "Tasks involve adding items to cart or saving search results.",
        "Deal identification is required (e.g., percentage discounts, seasonal offers).",
        "Users compare multiple products or prioritize top search results.",
        "Navigation includes hierarchical exploration (e.g., department > subcategory > product)."
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Tasks involve searching for products with specific attributes (e.g., price range, material, functionality).",
        "Users are required to filter results based on criteria like price, ratings, or product features.",
        "Tasks include comparing prices or sorting results (e.g., cheapest, highest-rated).",
        "Navigation includes adding items to cart or verifying delivery/shipping options (e.g., free shipping).",
        "Users must verify product availability (e.g., size, color, stock status).",
        "Tasks require locating deals, promotions, or seasonal sales (e.g., Winter Sale, Prime eligibility).",
        "Instructions involve browsing categories (e.g., electronics, fashion, home goods) to find items.",
        "Users are prompted to check customer reviews or ratings (e.g., 4+ stars, 50+ reviews).",
        "Tasks focus on identifying product specifications (e.g., dimensions, compatibility, energy efficiency).",
        "Actions include purchasing gift cards or selecting items for gifting (e.g., graduation, holidays)."
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Tasks involve mathematical problem-solving with equations and computations.",
        "Queries require scientific calculations in physics, chemistry, and engineering.",
        "Requests for unit conversions and measurements (e.g., mass, energy, time).",
        "Tasks involve data analysis, including statistical distributions and properties.",
        "Navigation includes solving differential equations and algebraic expressions.",
        "Queries demand step-by-step solutions for complex problems.",
        "Tasks involve chemical reactions, molecular properties, or material science.",
        "Requests for real-world data (e.g., financial metrics, geopolitical information).",
        "Navigation includes plotting or visualizing mathematical functions and curves.",
        "Tasks leverage Wolfram Alpha's domain-specific knowledge (e.g., astronomy, thermodynamics)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Tasks require computational or scientific calculations (e.g., unit conversions, equation solving, data analysis).",
        "Tasks demand step-by-step problem-solving or procedural explanations (e.g., differential equations, algebraic simplifications).",
        "Tasks span multiple STEM domains (e.g., chemistry, physics, mathematics, engineering).",
        "Tasks involve data interpretation or comparison (e.g., nutritional comparisons, statistical properties).",
        "Tasks focus on real-world applications (e.g., financial calculations, energy output, health metrics).",
        "Tasks require visualization of mathematical or scientific concepts (e.g., plotting curves, derivatives).",
        "Tasks involve retrieving definitions, properties, or explanations of concepts (e.g., Unicode characters, Riemann Hypothesis).",
        "Tasks rely on mathematical computations (e.g., integrals, derivatives, algebraic equations).",
        "Tasks necessitate access to specialized knowledge bases (e.g., material properties, historical data).",
        "Tasks include exploratory queries to test platform capabilities (e.g., language features, computational tools)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Tasks require computational or mathematical problem-solving using Wolfram Alpha's engine.",
        "Queries involve unit conversions (e.g., mass to moles, temperature anomalies).",
        "Tasks demand domain-specific knowledge (e.g., physics, chemistry, finance, statistics).",
        "Navigation includes accessing step-by-step solutions for equations or proofs.",
        "Tasks involve data retrieval from structured scientific or technical databases.",
        "Queries focus on plotting or visualizing mathematical functions or datasets.",
        "Tasks require parsing natural language inputs into computational queries.",
        "Navigation includes exploring predefined topic categories (e.g., Mathematics, Science & Technology).",
        "Queries involve solving differential equations, integrals, or polynomial expansions.",
        "Tasks demand comparison of quantitative results (e.g., nutritional data, material properties)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Tasks require computational or mathematical problem-solving capabilities",
        "Queries involve unit conversions and scientific calculations",
        "Navigation includes accessing specialized academic/scientific data (e.g. chemistry, physics)",
        "Tasks demand interaction with equation-solving features",
        "Users seek graphical/numerical results for STEM concepts",
        "Queries require data retrieval from curated knowledge bases",
        "Tasks involve comparison of scientific/technical properties",
        "Navigation patterns include accessing step-by-step solutions",
        "Queries target domain-specific terminology and definitions",
        "Tasks utilize Wolfram Alpha's proprietary algorithms for complex computations"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Tasks require computational problem-solving across mathematics, science, and engineering domains.",
        "Queries involve structured inputs with precise parameters (e.g., units, formulas, physical constants).",
        "Tasks demand integration of natural language understanding with symbolic computation (e.g., parsing equations from text).",
        "Navigation goals include retrieval of domain-specific data (e.g., chemical properties, financial metrics, astronomical values).",
        "Tasks frequently require unit conversions (e.g., mass to moles, metric to non-metric systems).",
        "Queries involve differential equations, algebraic manipulations, and calculus operations.",
        "Tasks utilize Wolfram Alpha's capability for multi-step solutions (e.g., showing intermediate steps for equation solving).",
        "Navigation goals include comparative analysis (e.g., material properties, financial comparisons).",
        "Tasks leverage Wolfram Alpha's structured knowledge base for scientific constants and real-world datasets.",
        "Queries frequently require visualization outputs (e.g., plots, geometric representations) alongside numerical results."
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Tasks require filtering recipes by user ratings (e.g., 4 stars or higher).",
        "Navigation includes searching for recipes with specific ingredient constraints (e.g., chicken breast, quinoa).",
        "Users target recipes with minimum review thresholds (e.g., 50+ reviews).",
        "Tasks involve retrieving nutritional information (e.g., carbs, calories per serving).",
        "Recipes are filtered by preparation/cooking time constraints (e.g., under 30 minutes).",
        "Dietary preferences (e.g., vegetarian, keto, gluten-free) are common search criteria.",
        "Tasks prioritize popular or highly-rated recipes (e.g., \"Most Popular Recipes of the 1960s\").",
        "Recipes are categorized by meal type (e.g., dinners, snacks, desserts, brunch).",
        "User-generated content (e.g., reviews, ratings) is integral to recipe evaluation.",
        "Tasks involve locating seasonal/holiday-specific recipes (e.g., Easter, Christmas)."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Tasks require searching for recipes with specific dietary constraints (e.g., vegetarian, keto, low-carb).",
        "Users frequently filter recipes by review count, star ratings, and preparation time.",
        "Navigation involves accessing detailed recipe information (ingredients, steps, nutrition facts).",
        "Tasks emphasize locating highly-rated recipes (4 stars or higher) with substantial user reviews.",
        "Users interact with recipe categories (e.g., dinners, desserts, cuisines) for browsing.",
        "Tasks involve saving/bookmarking recipes for future reference or meal planning.",
        "Users seek seasonal/holiday-specific recipes (e.g., Easter, Christmas, Ramadan).",
        "Tasks require comparing multiple recipes to identify optimal options (e.g., ingredient variety, cook time).",
        "Users prioritize recipes with nutritional metadata (calories, carbs) for health-conscious choices.",
        "Tasks include user-generated content interactions (e.g., leaving reviews, modifying recipes)."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Tasks require searching for recipes with explicit rating thresholds (e.g. 4+ stars)",
        "Users filter results by minimum review counts (e.g. 50+ reviews)",
        "Queries specify dietary requirements like vegetarian/vegan/keto/low-carb",
        "Tasks demand extraction of precise preparation/cook time parameters",
        "Nutritional information requests focus on macros (carbs/protein/calories)",
        "Ingredient-specific requirements appear in both datasets (e.g. zucchini, almond flour)",
        "Category navigation patterns exist (Dinners > Vegetarian > Italian)",
        "Comparison tasks between multiple recipes based on ratings/reviews",
        "Recipe saving/bookmarking actions are present in both datasets",
        "Holiday/occasion-specific recipe searches occur in both (Easter/Christmas/New Year's)"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Search functionality present with text input and button for recipe queries",
        "Recipe categories include 'DINNERS', 'MEALS', 'INGREDIENTS', and 'OCCASIONS'",
        "User reviews and star ratings displayed prominently for recipe evaluation",
        "Recipe saving/bookmarking capability with 'Save Recipe' buttons",
        "Nutritional information requirements specified in task objectives",
        "Filtering by dietary constraints (low-carb, vegetarian, keto) in search tasks",
        "Seasonal/holiday-specific recipe recommendations (Easter, Christmas)",
        "User-generated content interaction (recipe reviews and ratings)",
        "Popular recipe suggestions with high review counts as quality indicators",
        "Structured navigation through header menus and footer links for content discovery"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Tasks require filtering recipes by user ratings (e.g., 4 stars or higher)",
        "Navigation involves searching for recipes with specific ingredient combinations (e.g., chicken breast + quinoa)",
        "Users must locate recipes with minimum review thresholds (e.g., 50+ reviews)",
        "Tasks require extraction of preparation/cooking time details from recipes",
        "Nutritional information retrieval is required (e.g., carb content per serving)",
        "Recipe category navigation needed (e.g., vegetarian, keto, gluten-free)",
        "Tasks involve comparing popularity metrics (e.g., 'most saved Easter recipe')",
        "Users must identify seasonal/occasion-specific recipes (e.g., Thanksgiving desserts)",
        "Recipe storage/preservation instructions are required (e.g., sushi roll storage)",
        "Tasks require cross-referencing multiple recipe attributes (ratings+time+ingredients)"
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Both datasets require users to search for word definitions using a prominently placed search bar.",
        "Tasks in both datasets involve navigating to specific sections (e.g., Grammar, Thesaurus) via the main navigation menu.",
        "Users must locate and interpret pronunciation guides (including UK/US variants) for queried words.",
        "Both include tasks requiring access to translation features for converting words between languages (e.g., English to Chinese).",
        "Tasks involve interacting with example sentences provided alongside definitions.",
        "Both datasets require users to compare multiple dictionary entries (e.g., learner\u2019s dictionary vs. essential English variants).",
        "Navigation to the Word Scramble game in the '+Plus' section is present in tasks from both datasets.",
        "Users must identify synonyms and antonyms through the Thesaurus interface in both datasets.",
        "Tasks involve parsing grammatical explanations (e.g., passive voice, articles) from structured grammar guides.",
        "Both datasets include language switching tasks (e.g., changing from English to Deutsch) via a language selector."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Tasks require users to search for word definitions using a search bar interface.",
        "Tasks involve accessing specific sections like Grammar, Thesaurus, and Translate through navigational links.",
        "Tasks require users to retrieve pronunciation guides (both UK and US variants) for words.",
        "Tasks involve finding example sentences demonstrating word usage in context.",
        "Tasks require users to utilize translation features between English and other languages (e.g., Chinese, French, German).",
        "Tasks include exploring grammar rules (e.g., passive voice, comparative adjectives) via dedicated grammar sections.",
        "Tasks involve synonym/antonym lookup using the Thesaurus functionality.",
        "Tasks require interaction with supplementary content like the Word of the Day or blog posts.",
        "Tasks involve navigating language settings (e.g., switching to Deutsch) via a language selection panel.",
        "Tasks include accessing interactive elements like the Word Scramble game in the Plus section."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Tasks require searching for word definitions in an English dictionary context",
        "Tasks involve checking word pronunciations with options for both UK and US variants",
        "Tasks necessitate navigating to grammar explanations and usage examples",
        "Tasks include translating words between English and other languages (e.g. Chinese, French)",
        "Tasks require using the Thesaurus to find synonyms and related terms",
        "Tasks involve interacting with pronunciation audio features (speaker icons)",
        "Tasks require understanding different dictionary sections (Learner's Dictionary, Essential variants)",
        "Tasks involve accessing blog content for extended language learning resources",
        "Tasks require distinguishing between dictionary versions (English-English vs bilingual)",
        "Tasks involve exploring word game features (Word Scramble) for vocabulary practice"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Tasks require searching for word definitions with detailed entries including multiple aspects (meaning, pronunciation, examples).",
        "Navigation involves accessing grammar guides for explanations of linguistic rules (e.g., passive voice, articles).",
        "Users must locate UK/US pronunciation guides, often with audio playback buttons.",
        "Tasks involve synonym retrieval via Thesaurus integration for given words/phrases.",
        "Translation functionality is utilized for converting words/phrases between languages (e.g., English\u2013Chinese/Spanish).",
        "Interactive elements like word games (e.g., Word Scramble) are included in task workflows.",
        "Blog content navigation is required to find usage examples or linguistic explanations.",
        "Example sentence extraction is a recurring objective across word lookup tasks.",
        "Language settings adjustments (e.g., switching dictionary language direction) are part of task execution.",
        "Social media sharing features (e.g., Facebook, Twitter/X) are integrated into word entry workflows."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Tasks involve searching for word definitions, pronunciations, and example sentences in both datasets.",
        "Both datasets require navigating to Grammar sections for explanations and usage rules.",
        "Translation tasks between English and other languages (e.g., Chinese, Spanish) are present in both.",
        "Pronunciation comparisons (UK vs. US) are a common feature across tasks.",
        "Use of the Thesaurus for synonyms and antonyms is required in both datasets.",
        "Tasks involve accessing structured categories like Dictionary, Grammar, and Translate via navigation menus.",
        "Both datasets include tasks requiring interaction with language selection to switch dictionaries (e.g., English\u2013French, Deutsch).",
        "Example sentences are frequently requested to contextualize word usage in both datasets.",
        "Tasks in both datasets reference navigating to the Plus section for features like games (e.g., Word Scramble).",
        "Both datasets require understanding grammatical concepts (e.g., adjectives, adverbs, indirect speech) with examples."
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Tasks involve locating product specifications (e.g., camera features, battery life, technical details) on Apple devices.",
        "Navigation requires identifying pricing details for specific product models and configurations.",
        "Users must compare features, models, or versions of products (e.g., iPhone 16 Pro vs. Pro Max, AirPods variants).",
        "Tasks include locating trade-in values, recycling programs, or Apple Trade In workflows.",
        "Activities involve troubleshooting or support-related queries (e.g., battery life fixes, password recovery).",
        "Navigation targets product availability, in-store pickup options, or shipping details.",
        "Tasks require identifying compatibility information (e.g., device accessories, software requirements).",
        "Users must locate Apple\u2019s environmental initiatives or carbon-neutral product details.",
        "Activities involve exploring Apple services (e.g., Apple Music, Apple TV+, App Store Awards).",
        "Tasks include navigating account management features (e.g., Apple ID, Family Sharing, Business Manager)."
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Tasks focus on locating product specifications (e.g., technical details, dimensions, camera capabilities)",
        "Price comparison across models/configurations is a central requirement in both datasets",
        "Navigation involves identifying trade-in values and purchase upgrade paths",
        "Tasks require checking product release dates/regional availability",
        "Both involve troubleshooting/support queries (e.g., warranty checks, battery issues)",
        "Product customization tasks appear (storage/color selection, accessory pairing)",
        "Environmental impact/corporate responsibility information is sought in some tasks",
        "Requires navigation through multi-level product category menus (e.g., iPhone > Pro models)",
        "Accessory compatibility/pricing verification appears in multiple samples",
        "Tasks involve finding family/education-specific offers and account management features"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Tasks focus on locating product specifications (e.g., price, storage, dimensions, color options).",
        "Navigation requires interaction with product categories (e.g., iPhone, Mac, iPad, Apple Watch).",
        "Users must compare features or prices across multiple product models or generations.",
        "Tasks involve identifying release dates, availability status, or regional restrictions.",
        "Action-oriented goals include purchasing, trade-in valuation, or upgrade configurations.",
        "Navigation paths require accessing support pages (e.g., password recovery, setup guides).",
        "Tasks frequently target Apple-specific services (e.g., Apple Card, Apple Trade In, subscriptions).",
        "Users must parse technical details (e.g., camera specs, chip performance, display features).",
        "Tasks involve cross-referencing product compatibility (e.g., iOS versions, device pairings).",
        "Navigation includes accessing educational/business-focused sections (e.g., discounts, enterprise use cases)."
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Tasks require locating product specifications (e.g., camera specs, dimensions, storage options)",
        "Users must compare features/prices across multiple product variants or generations",
        "Navigation involves accessing detailed technical information from product description pages",
        "Tasks require understanding hierarchical product categorization (e.g., iPhone models > storage options > color variants)",
        "Users must differentiate between main product listings and accessory/related service offerings",
        "Tasks involve price calculation through configuration customization (storage, accessories, etc.)",
        "Navigation requires understanding of Apple's product release cycle terminology (Pro/Max/Plus designations)",
        "Users must locate and interpret trade-in/value estimation workflows",
        "Tasks require cross-referencing between product marketing pages and support documentation",
        "Navigation patterns involve alternating between store pages and technical specification resources"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Tasks require navigating product pages to find specifications (e.g., camera details, storage capacity, display size).",
        "Users must locate pricing information for specific product configurations (e.g., storage tiers, color variants).",
        "Tasks involve comparing features or specifications across multiple product generations (e.g., iPhone 13 Pro vs. 15 Pro).",
        "Navigation includes identifying release dates, availability, or regional launch details for devices.",
        "Users must access trade-in value calculators or promotional offers for older devices.",
        "Tasks require interacting with technical support pages (e.g., warranty checks, troubleshooting guides).",
        "Navigation focuses on product customization flows (e.g., configuring storage, accessories, carrier options).",
        "Users need to validate compatibility details (e.g., iOS version requirements, device interoperability).",
        "Tasks involve locating purchase-related actions like in-store pickup scheduling or online ordering.",
        "Navigation includes cross-referencing product categories (e.g., MacBooks for business vs. personal use) for targeted information."
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Tasks require searching for specific, factual information using search engine queries",
        "Tasks involve retrieving up-to-date or real-time data (e.g., latest scores, current prices, recent news)",
        "Tasks frequently require multi-step navigation (search \u2192 filter \u2192 extract)",
        "Tasks span multiple domains: technology, science, entertainment, sports, and current events",
        "Tasks emphasize finding time-sensitive information (latest releases, recent events, current statistics)",
        "Tasks often require comparative analysis (e.g., comparing products, features, or performance metrics)",
        "Tasks frequently involve location-based specifications (geographic requirements, local venues, regional data)",
        "Tasks commonly seek technical specifications or system requirements (hardware/software compatibility)",
        "Tasks require identification of authoritative sources (official websites, academic papers, verified platforms)",
        "Tasks demand extraction of structured data from unstructured content (numbers, dates, names, rankings)"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Tasks require precise keyword-based search queries to locate specific information (e.g., technical terms, names, dates).",
        "Navigation involves extracting factual data (e.g., dates, statistics, definitions) from search results or linked pages.",
        "Users must identify and interact with authoritative sources (e.g., official websites, academic articles, verified platforms).",
        "Tasks often demand filtering results by recency (e.g., 'latest,' '2025,' 'current').",
        "Multi-step actions are required, such as searching, navigating to a subpage, then extracting details.",
        "Queries involve domain-specific terminology (e.g., 'SHA,' 'SEO,' 'super-earth planets').",
        "Tasks prioritize structured data parsing (e.g., tables, lists, rankings) from results.",
        "Users need to distinguish between primary content and ancillary links (e.g., ads, related articles).",
        "Many tasks require comparative analysis (e.g., rankings, stock prices, recipe ratings).",
        "Tasks assume familiarity with common web platforms (e.g., GitHub, IMDb, Google services) for targeted navigation."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Tasks require locating and extracting specific factual information from search results",
        "Queries involve time-sensitive data retrieval (e.g., latest, recent, current information)",
        "Tasks require navigating through multiple information layers (search results \u2192 specific pages \u2192 detailed answers)",
        "Contains technical terminology searches across domains (AI, science, engineering, healthcare)",
        "Requires identification of authoritative sources within search results (official websites, research papers)",
        "Tasks demand parsing structured data formats (tables, lists, technical specifications)",
        "Contains comparative analysis requirements (rankings, performance metrics, feature comparisons)",
        "Requires handling compound queries with multiple sub-tasks",
        "Involves interpretation of domain-specific metrics (scientific measurements, financial data, sports statistics)",
        "Tasks frequently require cross-referencing multiple information sources for verification"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "All tasks require formulating and executing specific search queries to locate information.",
        "Each task demands extraction of precise, factual data (e.g., dates, prices, names) from web content.",
        "Tasks involve parsing search engine results to identify relevant links, articles, or resources.",
        "All require keyword-driven search strategies to optimize relevance and accuracy of results.",
        "Tasks focus on retrieving objective, verifiable answers rather than subjective opinions.",
        "Navigation often involves interacting with common UI elements (e.g., search bars, filters, buttons) to refine results.",
        "Most tasks require up-to-date or time-sensitive information (e.g., latest news, recent prices, current events).",
        "Tasks frequently involve multi-step processes (e.g., search \u2192 filter \u2192 extract \u2192 verify).",
        "All tasks prioritize actionable outcomes (e.g., copying data, booking services, listing results).",
        "Tasks often require cross-referencing information from multiple sources or webpage sections for validation."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Tasks require extracting specific factual information from web content",
        "Navigation involves interacting with search functionality to locate answers",
        "Tasks frequently involve finding recent/current data (e.g., latest scores, newest releases)",
        "Multi-step actions required (search, identify, then extract/compare information)",
        "Focus on structured content parsing (tables, lists, specifications, rankings)",
        "Requires identification of authoritative sources for technical/medical information",
        "Tasks demand understanding of hierarchical information architecture (categories->subcategories->details)",
        "Common need to compare multiple entities/products/concepts within search results",
        "Frequent interaction with dynamic content (news updates, stock prices, event schedules)",
        "Emphasis on precision in information retrieval (exact dates, numerical data, proper names)"
      ]
    }
  },
  "diffs_synth_from_real": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Tasks in B require making reservations/bookings (e.g., hotel rooms, restaurant tables) while A focuses on information retrieval without transactional actions",
        "B includes tasks requiring combination of temporal constraints (specific dates/days) with other filters, while A's time-related tasks focus on static operational hours",
        "B contains tasks that require checking specific menu items (e.g., Crispy Chicken Sandwich) while A focuses on general amenities",
        "B features tasks involving price tier specifications (e.g., 'moderately priced') as explicit search criteria, while A focuses on price information extraction",
        "B includes tasks requiring confirmation of specific service attributes (e.g., gluten-free options, wheelchair accessibility) while A focuses on general accessibility features",
        "B contains tasks that combine multiple transportation modes with intermediate stops (e.g., bike ride + coffee shop), while A focuses on single-mode route planning",
        "B features tasks requiring comparison of user review content (e.g., mentions of specific food items) while A focuses on rating comparisons",
        "B includes tasks with explicit guest count specifications for accommodations, while A's accommodation tasks focus on general availability",
        "B contains tasks that require checking current/open-now status more frequently than A",
        "B features tasks using relative location descriptors ('near me') more prominently than A's zip code-based searches"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Dataset B tasks emphasize booking accommodations with explicit budget constraints (e.g., $400 for 2 nights), while Dataset A prioritizes route planning without cost parameters.",
        "Dataset B includes tasks requiring interaction with user-generated photo content (e.g., viewing park photos), absent in Dataset A's structured data extraction from reviews.",
        "Dataset B contains queries about medical infrastructure (e.g., hospitals, pharmacies) and accessibility-specific transportation, unlike Dataset A's commercial/public facility focus.",
        "Dataset B tasks request environmental/ecological information (e.g., white-nose syndrome, deforestation rates) not present in Dataset A's location-based queries.",
        "Dataset B requires explicit reservation management (e.g., table bookings with dates) where Dataset A focuses on availability checks without transactional follow-through.",
        "Dataset B includes price comparison tasks for services (e.g., hotel rates) while Dataset A compares spatial/operational attributes like distance or hours.",
        "Dataset B tasks specify hospitality service policies (e.g., free cancellation) whereas Dataset A emphasizes physical amenities like parking availability.",
        "Dataset B contains food service requirements with cuisine specificity (e.g., French brunch spots) compared to Dataset A's generic category filters.",
        "Dataset B involves online ordering capabilities for restaurants, a feature never mentioned in Dataset A's transactional actions.",
        "Dataset B tasks reference chain restaurant locations (e.g., The Melt in California) while Dataset A queries generic commercial categories."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Dataset B tasks often involve multi-stop itineraries (e.g., 'route from X to Y then to Z') while A focuses on single-leg routes",
        "Tasks in B more frequently require booking/reservations with specific service customization (e.g., 'photos shoots', 'specific floor access') unlike A's generic booking actions",
        "Dataset B contains tasks requiring comparison of temporal availability patterns (e.g., 'start of year', 'New Year's Eve stays') rather than A's immediate time constraints ('open now')",
        "B includes tasks about verifying business-specific operational data (e.g., 'Check reviews for my business') absent in A",
        "Dataset B tasks more commonly combine accessibility requirements with specific cuisine types (e.g., 'wheelchair-accessible Italian restaurant')",
        "Tasks in B frequently involve planning recreational activities (e.g., 'hike to Yosemite Falls with park photos') beyond A's basic location searches",
        "Dataset B contains explicit price verification tasks for specific services (e.g., 'price of cable car ride') not seen in A's general price filtering",
        "B includes tasks requiring identification of attraction-specific infrastructure details (e.g., 'stairs access to Eiffel Tower floor')",
        "Dataset B tasks more often combine guest capacity requirements with accommodation searches (e.g., 'hotel for 3 people')",
        "Tasks in B show stronger emphasis on international/comparative searches (e.g., 'American restaurants in Spain') versus A's domestic focus"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Dataset B tasks frequently require booking accommodations or services with specific dates and guest counts, while A focuses on information retrieval without reservations.",
        "Dataset B includes tasks involving flight search and multi-city travel planning, absent in A which centers on ground transportation.",
        "Tasks in B often specify temporal constraints (e.g., 'open now', 'New Year's Eve dates'), whereas A uses static operational hours.",
        "Dataset B contains explicit budget range requirements (e.g., '$150-$350/night'), while A focuses on price comparisons without monetary thresholds.",
        "B includes tasks requiring multi-day itinerary coordination (e.g., 3-day hotel stays), unlike A's single-journey route planning.",
        "Dataset B emphasizes amenity-specific filters (free Wi-Fi, breakfast inclusion) as primary criteria, whereas A treats amenities as secondary attributes.",
        "Tasks in B require interaction with booking/reservation systems, while A focuses on mapping/geolocation features.",
        "Dataset B includes tasks demanding analysis of user reviews for specific experiential contexts (e.g., hospital patient experiences), beyond A's general rating thresholds.",
        "B contains tasks requiring historical/geographical research (e.g., environmental threats to landmarks), absent in A's practical navigation focus.",
        "Dataset B features price comparison across time periods (e.g., seasonal rates), while A compares concurrent prices."
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Dataset B tasks frequently involve making reservations/bookings (e.g., hotels, restaurants) with specific date/time requirements",
        "Dataset B includes explicit requests for photo content analysis (e.g., 'view photos of landmarks')",
        "Dataset B contains tasks requiring price comparisons and budget optimization strategies",
        "Dataset B emphasizes itinerary planning with predetermined stops/waypoints more prominently",
        "Dataset B includes specific requests for fare/pricing information (e.g., transit fares, hotel rates)",
        "Dataset B tasks more frequently involve date-specific constraints (e.g., 'for New Year's Eve')",
        "Dataset B requires analysis of user review content (e.g., 'review comfort food') rather than just ratings",
        "Dataset B includes requests for elevation profiles and topographic information in route planning",
        "Dataset B contains tasks requiring neighborhood characteristic assessments beyond basic location",
        "Dataset B features more explicit requests for menu analysis and cuisine-specific planning"
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks in B explicitly require comparing exact pricing figures (e.g. Copilot Pro cost, Codespaces pricing) while A focuses on feature tier comparisons",
        "B contains tasks requiring navigation through security compliance documentation (certifications, trust policies) absent in A's security-related tasks",
        "B includes specific queries about third-party integrations (SAP, Hyperlint AI) while A focuses solely on native GitHub features",
        "Tasks in B require direct interaction with legal/policy documents (terms of service, IP implications) rather than simple policy review as in A",
        "B contains explicit requests for vulnerability reporting procedures while A focuses on vulnerability discovery and remediation features",
        "Mobile-specific functionality exploration (app availability, mobile usage) appears exclusively in B's tasks",
        "B requires analysis of service status/historical incidents (API downtime) not present in A's metrics-focused tasks",
        "Tasks in B demand comparison of AI tool capabilities across development phases (NPC projects, multiple languages) while A focuses on repository characteristics",
        "B includes explicit queries about educational account creation processes where A focuses on resource discovery",
        "Tasks in B require navigation through marketplace integrations (GitHub Apps) absent from A's workflow automation focus"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Tasks in B require locating pricing/plan details for GitHub's own services (Copilot/Enterprise) rather than comparing third-party repository stats",
        "Tasks in B focus on security compliance documentation (advisories, certificates, data policies) absent in A's repository-centric security tasks",
        "Tasks in B involve finding GitHub's internal service documentation (APIs, Actions syntax) rather than user-generated content",
        "Tasks in B require accessing legal/enterprise resources (quotes, compliance certificates) not present in A's educational focus",
        "Tasks in B emphasize GitHub's proprietary tools (Copilot, Codespaces) configuration over A's community content analysis",
        "Tasks in B demand understanding GitHub's data processing policies (Copilot training data) rather than user repository metadata",
        "Tasks in B involve comparing GitHub service tiers (REST vs GraphQL API) instead of repository feature comparisons in A",
        "Tasks in B require navigating GitHub's own job listings/enterprise sales materials absent from A's community focus",
        "Tasks in B focus on GitHub platform administration (linking PRs/issues) rather than contributor statistics analysis in A",
        "Tasks in B demand locating GitHub's partnership announcements/legal updates rather than general open-source project updates"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Dataset B tasks focus more on GitHub Copilot's features, security, and pricing details compared to Dataset A's broader product scope",
        "Dataset B includes explicit queries about GitHub's data usage policies and compliance (e.g., GDPR, confidentiality) not present in Dataset A",
        "Dataset B contains more 'how-to' implementation tasks (e.g., CLI installation, autograding configuration) compared to Dataset A's information retrieval focus",
        "Dataset B emphasizes account creation workflows and privacy settings management more than Dataset A",
        "Dataset B tasks specifically compare GitHub Copilot plans rather than general pricing tiers seen in Dataset A",
        "Dataset B includes troubleshooting scenarios (e.g., error resolution) absent from Dataset A's tasks",
        "Dataset B focuses more on GitHub Copilot extensions/integrations while Dataset A emphasizes core platform features",
        "Dataset B contains explicit testing of navigation robustness (e.g., handling broken links) not seen in Dataset A",
        "Dataset B includes policy-related queries (e.g., private repo confidentiality) absent from Dataset A",
        "Dataset B tasks involve more active configuration of developer tools (e.g., GitHub Actions setup) compared to Dataset A's passive exploration"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Dataset B tasks require comparing GitHub Copilot plans across free/Pro/Enterprise tiers while Dataset A focuses on comparing general pricing tiers (Free/Pro/Enterprise)",
        "Dataset B includes navigation tasks for GitHub's legal terms and privacy policies while Dataset A does not",
        "Dataset B requires finding trial enrollment processes for enterprise/server solutions while Dataset A focuses on exploring enterprise offerings without trial actions",
        "Dataset B contains tasks about GitHub Copilot's data retention policies and security measures while Dataset A focuses on general security content",
        "Dataset B includes API documentation retrieval for custom integrations while Dataset A focuses on general technical documentation",
        "Dataset B tasks involve checking real-time system status/service availability while Dataset A does not",
        "Dataset B requires comparing plan upgrade paths (e.g. Free to Pro) while Dataset A compares static plan features",
        "Dataset B includes external platform references (Google Play Store ratings) while Dataset A stays within GitHub properties",
        "Dataset B tasks target specific CVE identifiers for vulnerability research while Dataset A focuses on general security tools/content",
        "Dataset B contains account management tasks (upgrades, trials, cancellations) while Dataset A focuses on information retrieval"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Dataset A tasks focus on locating specific repository details (e.g., forks, contributors, recent commits)",
        "Dataset B tasks emphasize understanding feature implementation steps (e.g., creating projects, using task lists)",
        "Dataset A requires identifying numerical metrics (star counts, fork numbers, course quantities)",
        "Dataset B focuses on comparing plan tiers and pricing structures across products",
        "Dataset A tasks involve temporal filtering (repos updated within specific date ranges)",
        "Dataset B tasks require understanding feature limitations and upgrade paths between plans",
        "Dataset A emphasizes discovery of trending/open-source projects through search criteria",
        "Dataset B focuses on documentation navigation for specific workflows (security reporting, API usage)",
        "Dataset A tasks require identification of top contributors/maintainers in repositories",
        "Dataset B emphasizes troubleshooting scenarios (error resolution, rate limit handling)"
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Dataset B tasks emphasize real-time updates and upcoming games, while Dataset A focuses on retrieving historical game data and past results",
        "Dataset B includes navigation tasks for international soccer leagues (e.g. Premier League, Serie A) not present in Dataset A's sport focus",
        "Dataset B requires comparison of betting odds and moneyline predictions across multiple concurrent matchups, while Dataset A only mentions basic betting information retrieval",
        "Dataset B tasks demand navigation through postseason/bowl game content (CFP, Conference League) not featured in Dataset A's regular season-focused queries",
        "Dataset B contains requests for multi-team injury reports and roster changes, whereas Dataset A focuses on single-team injury status checks",
        "Dataset B requires identification of broadcast network combinations (e.g. TNT/truTV/Max) while Dataset A only specifies single network information",
        "Dataset B includes fantasy basketball research tasks (player lineup impacts, fantasy picks) not present in Dataset A's statistical queries",
        "Dataset B tasks involve live gamecast navigation with quarter/minute-specific updates, whereas Dataset A focuses on final period statuses",
        "Dataset B requires navigation of collegiate ranking systems (AP rankings in NCAAM/NCAAW) not present in Dataset A's team abbreviation-based tasks",
        "Dataset B includes cross-sport comparison tasks (NFL vs. NBA schedules) while Dataset A maintains league-specific navigation requirements"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Dataset B includes tasks requiring navigation across multiple sport types (e.g., simulating NBA trades with soccer players) while A focuses on single-sport navigation",
        "B requires accessing international sports content (e.g., Portuguese Primeira Liga) not present in A's domestic-focused tasks",
        "B contains explicit betting odds analysis (NFL Week 17/Super Bowl odds) whereas A focuses purely on game statistics",
        "B features cross-sport comparisons (e.g., college football vs NFL odds) while A comparisons remain within same sport",
        "B includes fantasy sports integration beyond basketball (baseball/hockey) vs A's basketball-focused fantasy tasks",
        "B requires navigation of time-bound playoff scenarios (NFL Week 18 standings) vs A's general standings access",
        "B contains soccer-specific transfer market tracking tasks absent from A's player movement queries",
        "B features international competition tracking (US vs Canada friendly) not found in A's domestic match focus",
        "B includes browser navigation patterns (back button behavior) beyond A's in-page navigation requirements",
        "B requires access to collegiate football bowl game schedules/rankings while A focuses on professional league standings"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Dataset B tasks frequently involve retrieving schedules and results for collegiate bowl games and the College Football Playoff (CFP), whereas Dataset A emphasizes NBA postseason tracking.",
        "Dataset B includes explicit queries for sports betting odds (e.g., AFC Champion odds), which are absent in Dataset A tasks.",
        "Dataset B requires navigation to international soccer leagues (e.g., English Premier League, Serie A), while Dataset A focuses primarily on U.S.-centric leagues (NBA, NFL, NCAA).",
        "Dataset B tasks often specify temporal constraints by NFL/CFB weeks (e.g., 'Week 18') or seasons (e.g., '2024-25'), whereas Dataset A uses relative temporal filters (e.g., 'past 2 days').",
        "Dataset B includes requests for gamecast/play-by-play data (e.g., live game tracking), while Dataset A focuses on post-game summaries and box scores.",
        "Dataset A tasks involve comparative statistical calculations (e.g., GP percentage comparisons), whereas Dataset B prioritizes factual retrievals (e.g., standings, schedules).",
        "Dataset B emphasizes NFL team performance metrics (e.g., 'Broncos' stats') and CFB rankings, while Dataset A centers on NBA player/team analytics (e.g., LeBron James' career stats).",
        "Dataset B requires navigation to fantasy football/basketball rankings and draft tools, whereas Dataset A focuses on Tournament Challenge brackets.",
        "Dataset B tasks include historical team season results (e.g., 'Plymouth Argyle 2022-23'), while Dataset A emphasizes real-time player transactions.",
        "Dataset B features queries for broadcast/viewing details (e.g., 'channel for Tottenham match'), absent in Dataset A's ESPN+ integration tasks."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Tasks in B emphasize college football (NCAAF) bowl games and College Football Playoff (CFP) navigation, while A focuses on NBA/NFL/NCAA basketball/NHL leagues.",
        "B includes tasks related to podcasts and ESPN Radio content discovery, not prominently featured in A.",
        "B requires retrieving fantasy baseball rankings (e.g., 2025 projections), while A focuses on basketball/football fantasy tools.",
        "B contains explicit requests for ESPN+ original programming exploration, whereas A emphasizes ESPN+ streaming of live games.",
        "B involves finding transfer news between specific clubs (e.g., Real Madrid to Arsenal), while A focuses on general NBA trade updates.",
        "B includes historical player statistics queries (e.g., Michael Jordan's 2002-03 season), whereas A focuses on current/recent season stats.",
        "B requires locating specific game broadcasts on ESPN+ (e.g., Sabres vs. Blues), while A focuses on general broadcast info parsing.",
        "B tasks involve navigation to team-specific pages (e.g., Boston Red Sox team section), while A focuses on league-wide standings/schedules.",
        "B includes postseason elimination status tracking (e.g., CFP eliminated teams), while A focuses on playoff bracket navigation.",
        "B contains direct roster comparison tasks between NFL teams, whereas A focuses on statistical comparisons between players/teams."
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Dataset B focuses on NFL season-specific queries (e.g., 2024 NFL season team performance) while A emphasizes real-time NBA/NBA-centric metrics",
        "B includes tasks requiring NCAAF bowl game schedules/results (e.g., Week 16 scores), whereas A focuses on NCAA basketball tournaments",
        "B contains explicit requests for cross-team trade simulations (e.g., Hawks/Celtics/Bulls trades) absent in A",
        "B requires navigation to lacrosse/NLL scores and standings, which are not present in A's sport coverage",
        "B tasks involve checking playoff probabilities (e.g., Patriots' 2024 NFL playoff chances) vs A's general standings lookups",
        "B includes soccer transfer news tracking (EPL transfers) while A focuses on NBA trades",
        "B tasks demand historical game result retrieval (e.g., specific Jan 4 Bills game) vs A's emphasis on latest/most recent games",
        "B contains esports-related queries absent in A's traditional sports focus",
        "B requires fantasy baseball statistical analysis (Red Sox pitching/batting metrics) where A focuses on basketball fantasy tools",
        "B tasks involve multi-league comparisons (NHL standings + NFL schedules) while A maintains single-league navigation flows"
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Dataset B tasks involve locating academic research papers and their abstracts/citations, while Dataset A focuses on extracting model/dataset metadata",
        "Dataset B requires navigating API endpoint documentation for model deployment, whereas Dataset A focuses on API usage for inference/generation",
        "Dataset B includes tasks about content generation capabilities (image/code generation), while Dataset A focuses on technical specifications",
        "Dataset B contains accessibility-focused tasks (HTML conversions, format adaptations) not present in Dataset A",
        "Dataset B tasks involve community interaction for tutorial assistance, while Dataset A focuses on community metrics (likes/followers)",
        "Dataset B emphasizes commercial product integration scenarios, whereas Dataset A focuses on license verification",
        "Dataset B includes environment setup tasks (Google Colab integration), while Dataset A focuses on existing framework usage",
        "Dataset B requires identifying beginner-friendly resources/tutorials, while Dataset A assumes technical proficiency",
        "Dataset B tasks involve dataset structure analysis (row counts/splits), while Dataset A focuses on dataset attributes",
        "Dataset B includes specific library integration tasks (timm, Keras), while Dataset A mentions frameworks generally"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Dataset B tasks require academic research paper retrieval (e.g. 'Find DialoGPT research paper') while A focuses on technical documentation search",
        "Dataset B tasks involve model performance benchmarking against specific evaluation metrics (e.g. 'MMMU benchmark') unlike general metric comparisons in A",
        "Dataset B tasks require understanding model architecture details (e.g. 'architecture and uses') while A focuses on basic model metadata",
        "Dataset B contains tasks about commercial usage restrictions (e.g. 'non-commercial use with attribution') beyond basic license type retrieval in A",
        "Dataset B tasks involve interactive model testing (e.g. 'Try out Text-to-Image model') while A focuses on API implementation instructions",
        "Dataset B requires identification of models for specific commercial applications (e.g. 'commercial product development') unlike general modality identification in A",
        "Dataset B tasks involve community interaction features (e.g. 'make comment on discussion') not present in A's information retrieval focus",
        "Dataset B requires dataset format specificity (e.g. 'text format datasets') vs A's general format requirements",
        "Dataset B tasks include ethical AI considerations (e.g. 'ethical conversational AI') absent from A's technical requirements",
        "Dataset B contains troubleshooting scenarios (e.g. 'report issues accessing model') not seen in A's task structures"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Dataset B tasks require retrieving information about academic research papers and their implementations, while Dataset A focuses on practical model deployment",
        "Dataset B includes tasks involving troubleshooting error messages (e.g., 'Task not found' errors), which are absent in Dataset A",
        "Dataset B contains tasks requiring access to source code repositories and GitHub integration, unlike Dataset A",
        "Dataset B features tasks requiring non-English language support (e.g., German documentation), while Dataset A focuses on English content",
        "Dataset B includes explicit requirements for commercial product integration analysis, whereas Dataset A focuses on general enterprise features",
        "Dataset B tasks demand understanding of performance optimization techniques for LLMs, not present in Dataset A",
        "Dataset B requires navigation through model versioning history, while Dataset A focuses on latest version identification",
        "Dataset B contains tasks specifically addressing low-resource language support, whereas Dataset A focuses on general language compatibility",
        "Dataset B includes tasks requiring accessibility guideline interpretation (e.g., LaTeX accessibility), absent in Dataset A",
        "Dataset B tasks involve detailed technical configuration parameters (e.g., numerical precision requirements), while Dataset A focuses on general API usage"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Dataset B tasks focus more on model discovery for specific use cases (e.g. dog breed classification) rather than comparative ranking",
        "Dataset B requires direct interaction with model code/implementation (commit history, SDK installation) unlike Dataset A",
        "Dataset B tasks involve troubleshooting (error resolution, documentation gaps) not present in Dataset A",
        "Dataset B emphasizes commercial application requirements (licensing for commercial use, deployment costs)",
        "Dataset B requires identification of model architectures (base models, MoE variants) rather than just capabilities",
        "Dataset B tasks focus more on model accessibility (beginner resources, API widget types) than Dataset A",
        "Dataset B includes explicit paper-to-model cross-referencing (finding associated research papers)",
        "Dataset B tasks require handling multilingual content (translation models, multilingual datasets)",
        "Dataset B emphasizes practical deployment (model serving, endpoint pricing) over technical specifications",
        "Dataset B tasks involve community interaction features (liking models, accessing user projects)"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Dataset A tasks require specifying exact version numbers or release dates (e.g. 'March 2023') while Dataset B uses broader temporal references",
        "Dataset B includes tasks requiring ethical compliance checks (e.g. 'align with ethics policy') not present in Dataset A",
        "Dataset A emphasizes GitHub star counts as success metrics while Dataset B focuses more on practical implementation metrics",
        "Dataset B contains tasks requiring dataset format conversions (e.g. Parquet format) not mentioned in Dataset A",
        "Dataset A tasks specify exact license types (Apache-2.0, cc-by-sa-4.0) while Dataset B refers to general usage licenses",
        "Dataset B includes installation/testing procedures (e.g. 'Install and test Safetensors') absent from Dataset A",
        "Dataset A requires identification of numeric popularity metrics (e.g. '1M+ downloads') while Dataset B focuses on functional capabilities",
        "Dataset B tasks involve academic paper research (e.g. 'Find DialoGPT research paper') not required in Dataset A",
        "Dataset A focuses on model attribute extraction while Dataset B includes API experimentation tasks (e.g. 'experiment with vision and NLP models')",
        "Dataset B contains CPU optimization tasks (e.g. 'optimize model inference on CPU') not present in Dataset A"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Tasks in Dataset A require identifying specific course modules (e.g., 'Measuring Sustainability'), while Dataset B focuses on general course topics without module-level granularity.",
        "Dataset A includes tasks requesting instructor biographies and other courses taught by them, whereas Dataset B does not emphasize instructor background details.",
        "Dataset A tasks involve checking degree program admission deadlines, while Dataset B lacks explicit queries about degree enrollment timelines.",
        "Dataset B tasks frequently require comparing multiple courses (e.g., 'Compare different project management courses'), while Dataset A focuses on singular course identification.",
        "Dataset B emphasizes career-oriented skill acquisition (e.g., 'data analyst role skills'), whereas Dataset A prioritizes course attribute verification like duration and certificates.",
        "Dataset A tasks specify language requirements (e.g., English courses), while Dataset B queries are language-agnostic unless explicitly stated.",
        "Dataset A requires identifying exact video counts within course modules, while Dataset B focuses on broader curriculum exploration.",
        "Dataset B includes queries about guided projects (e.g., 'beginner Python guided project'), which are absent in Dataset A tasks.",
        "Dataset A tasks demand verification of university partnerships from specific regions (e.g., Australian institutions), while Dataset B lacks geographic institutional filters.",
        "Dataset B emphasizes finding newest/latest courses through sorting (e.g., 'sort by newest'), while Dataset A prioritizes established course offerings with predefined filters."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Dataset B tasks require identifying application deadlines for degree programs, while A focuses on course/program durations",
        "Dataset B includes queries for courses with specific language offerings (e.g., German), not present in A",
        "Dataset B tasks involve searching for broader learning categories (e.g., 'social impact'), while A focuses on technical skill components",
        "Dataset B requires verifying enrollment processes rather than certification availability like in A",
        "Dataset B tasks demand comparison of career development paths rather than direct credential comparisons in A",
        "Dataset B includes queries about recommended experience levels for roles, absent in A's prerequisite focus",
        "Dataset B tasks request general course descriptions without module-level details required in A",
        "Dataset B requires identification of university admission processes rather than partner verification like A",
        "Dataset B tasks involve searching for free course options as primary criteria, unlike A's certification focus",
        "Dataset B includes queries about program solutions/offerings rather than specific provider partnerships like A"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Dataset B tasks emphasize career path exploration (e.g., 'Data Scientist', 'Product Manager') while Dataset A focuses on career-related metrics like salary/job availability",
        "Dataset B queries prioritize skill acquisition descriptions over instructor bios compared to Dataset A's frequent requests for instructor details",
        "Dataset B includes direct requests for course/program definitions (e.g., 'Find the definition of data analytics') absent in Dataset A",
        "Dataset B tasks show stronger focus on AI/ML specialization (e.g., 'Prompt Engineering', 'Generative AI') compared to Dataset A's general AI queries",
        "Dataset B contains more requests for course comparisons (e.g., 'Compare Python courses for business professionals') not seen in Dataset A",
        "Dataset B tasks frequently specify language requirements (e.g., 'taught in English') unlike Dataset A's language-agnostic queries",
        "Dataset B includes browser/technical requirements (e.g., 'download Firefox browser') absent in Dataset A tasks",
        "Dataset B shows increased focus on educational prerequisites (e.g., 'focus on Linear Algebra') compared to Dataset A's skill-level filters",
        "Dataset B tasks request content verification from specific providers (e.g., 'IBM AI course topics') while Dataset A seeks institutional affiliations",
        "Dataset B contains more meta-queries about course structure (e.g., 'first two modules of...') compared to Dataset A's module existence checks"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Dataset B tasks require evaluating user reviews and testimonials as part of course selection criteria.",
        "Dataset B includes tasks involving configuration of course settings (e.g., language preferences, subtitles).",
        "Dataset B emphasizes enrollment processes, such as accessing enrollment pages or signing up for courses.",
        "Dataset B tasks focus on researching detailed course content, including module structures and specific learning components.",
        "Dataset B requires multi-step navigation (e.g., search, filter, review content) to complete tasks.",
        "Dataset B prioritizes courses with explicit mentions of hands-on or project-based learning experiences.",
        "Dataset B tasks involve exploring career-specific pathways tied directly to job roles (e.g., Data Analyst, Cybersecurity Analyst).",
        "Dataset B includes tasks requiring comparisons across course types (e.g., Professional Certificates vs. Degrees) with granular detail.",
        "Dataset B tasks explicitly mention financial aid options and free certification requirements.",
        "Dataset B focuses on courses with specific instructional formats (e.g., labs, interactive projects) rather than general descriptions."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Dataset B tasks require extracting enrollment processes and admission requirements for degree programs, while Dataset A focuses on course/module specifics.",
        "Dataset B tasks involve identifying course/program refund policies and language options (e.g., English), absent in Dataset A.",
        "Dataset A tasks emphasize granular numerical metrics (e.g., weekly time commitments, exact review counts), whereas Dataset B prioritizes program structures.",
        "Dataset B tasks include exploring career alignment with courses (e.g., 'Find a job matching a Coursera course'), unlike Dataset A.",
        "Dataset A tasks require filtering by explicit rating thresholds (e.g., '4+ stars'), while Dataset B lacks rating-based filtering in its samples.",
        "Dataset B tasks involve comparing distinct specializations/tracks within a subject, whereas Dataset A focuses on individual course components.",
        "Dataset A tasks require identifying partner institutions/companies by region (e.g., Australia), while Dataset B does not mention geographic filters.",
        "Dataset B tasks include verifying credential applicability to degrees (e.g., 'pre-approved credits'), absent in Dataset A.",
        "Dataset A tasks demand testimonials for Specializations, whereas Dataset B does not reference user testimonials.",
        "Dataset B tasks explicitly seek administrative details (e.g., 'clear refund policies'), while Dataset A focuses on content/outcomes."
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Dataset B requires interacting with specific paper formats (HTML vs PDF) for content analysis, while A focuses on format distinction without content interaction",
        "Tasks in B involve troubleshooting document conversion errors (e.g., HTML formatting issues) not present in A",
        "B includes tasks requiring direct paper downloads and local content analysis (e.g., formula counting) absent in A",
        "Dataset B contains tasks requiring navigation through paper revision history and version comparisons",
        "B requires interaction with arXiv's operational status notifications and system alerts not mentioned in A",
        "Tasks in B involve specific arXiv identifier lookup and direct paper access using IDs (e.g., arXiv:2411.04175)",
        "Dataset B includes multimedia content handling (e.g., video references in papers) not required in A",
        "B requires navigation through paper components/sections (e.g., 'related work' sections) beyond metadata extraction",
        "Tasks in B involve cross-referencing with external academic platforms (e.g., dblp) beyond arXiv's internal structure",
        "Dataset B contains tasks requiring understanding of arXiv's licensing information and copyright policies"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Dataset B tasks require locating specific paper sections (e.g., introductions, experimental setups) not emphasized in Dataset A",
        "Dataset B includes tasks involving direct arXiv identifier searches (e.g., 2412.21185), while Dataset A focuses on metadata filters",
        "Dataset B contains tasks requiring technical troubleshooting (e.g., HTML error investigation) absent in Dataset A",
        "Dataset B tasks demand handling interdisciplinary research queries across STEM fields, unlike Dataset A's discipline-specific focus",
        "Dataset B includes format-specific download requirements (e.g., source code retrieval) not present in Dataset A tasks",
        "Dataset B requires interaction with submission guidelines/author instructions, while Dataset A focuses on content extraction",
        "Dataset B tasks involve figure/table localization within papers (e.g., AlN-on-sapphire grating coupler figures), unlike Dataset A's metadata focus",
        "Dataset B contains explicit requests for paper revision/resubmission processes absent in Dataset A",
        "Dataset B tasks require comparison of search term variations (e.g., CVPR 2023 vs CVPR2023) as core objectives rather than incidental actions",
        "Dataset B includes author-specific publication tracking across multiple categories (e.g., Kai Schmitz's work) requiring cross-archive navigation"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Dataset A tasks require filtering by subcategories within hierarchical subject classifications (e.g., Physics > Astrophysics > astro-ph.EP), while Dataset B tasks do not involve subcategory navigation.",
        "Dataset A tasks explicitly require cross-archive comparison of search results (e.g., comparing category-specific vs all-archive results), while Dataset B tasks focus on single-resource retrieval.",
        "Dataset A includes tasks requiring identification of specific metadata elements like journal references (e.g., ACL Workshop), while Dataset B focuses on basic citation information retrieval.",
        "Dataset A tasks require interpretation of multi-language submission guidelines, while Dataset B tasks involve license type identification and copyright information extraction.",
        "Dataset A tasks require formula identification and counting within papers, while Dataset B tasks focus on section-level content retrieval (e.g., Results, Methodology).",
        "Dataset A includes time-bound queries with precise date ranges (e.g., October 2023), while Dataset B uses relative timeframes without specific calendar dates.",
        "Dataset A tasks require first-author affiliation tracking, while Dataset B emphasizes author-based searches without affiliation requirements.",
        "Dataset A contains tasks requiring format comparison (HTML vs PDF content analysis), while Dataset B focuses on format-specific access (PDF download instructions).",
        "Dataset A tasks involve Boolean operator usage (e.g., journal reference vs title search variations), while Dataset B uses simple keyword matching without operator specifications.",
        "Dataset A includes leadership team identification requirements, while Dataset B tasks involve system troubleshooting (layout errors, accessibility formats)."
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Dataset B tasks require extracting specific technical sections from papers (e.g., methodology, references) while Dataset A focuses on metadata retrieval",
        "Dataset B contains tasks involving explicit paper identifiers (arXiv IDs) for direct access, unlike Dataset A",
        "Dataset B includes operational queries about source code downloads for physics articles, absent in Dataset A",
        "Dataset B tasks demand verification of paper existence in specific repositories (e.g., Computing Research Repository) not required in Dataset A",
        "Dataset B requires interpretation of technical content like equations/figures while Dataset A focuses on result counts/summaries",
        "Dataset B contains queries about cross-repository validation (e.g., Computer Science vs. Physics archives) whereas Dataset A stays within category boundaries",
        "Dataset B includes non-English character handling in queries (e.g., Arabic/Persian text) not present in Dataset A tasks",
        "Dataset B tasks target conclusion sections of papers specifically, unlike Dataset A's abstract-focused requirements",
        "Dataset B contains malformed/multi-lingual query handling (e.g., mixed scripts) absent from Dataset A's standardized syntax",
        "Dataset B requires navigation through paper structure (sections, figures) while Dataset A focuses on archive-level navigation"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Dataset A tasks require precise metadata extraction (e.g., author affiliations), while Dataset B focuses on basic document retrieval without affiliation identification",
        "Dataset A includes explicit format-specific retrieval requirements (PDF/HTML/TeX), while Dataset B tasks only mention downloading without format constraints",
        "Dataset A contains multi-language abstract handling requirements, while Dataset B shows no evidence of language diversity considerations",
        "Dataset A tasks demand temporal precision (e.g., 'last two days'), while Dataset B uses vague temporal references ('recent')",
        "Dataset A requires citation/bibliography cross-referencing, while Dataset B only involves simple reference lookups",
        "Dataset A tasks specify exact category hierarchies (e.g., 'Physics \u2192 Astrophysics'), while Dataset B uses broader subject areas",
        "Dataset A includes journal reference filtering requirements absent in Dataset B tasks",
        "Dataset B contains technical support queries (e.g., submission troubleshooting) not present in Dataset A",
        "Dataset A requires comparison between search scopes (category-specific vs cross-archive), while Dataset B uses simple archive-wide searches",
        "Dataset A tasks involve quantitative result analysis (e.g., 'how many formulas'), while Dataset B focuses on qualitative content retrieval"
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Dataset B requires handling pop-up surveys/interruptions during navigation (e.g., 'Take Survey' dialog) while Dataset A maintains uninterrupted browsing flow",
        "Dataset B tasks involve locating time-sensitive content with absolute timestamps (e.g., '25 December 2024') compared to Dataset A's relative timestamps (e.g., 'X hrs ago')",
        "Dataset B contains tasks requiring identification of content through embedded multimedia previews (e.g., video stills from web series) not present in Dataset A's article formats",
        "Dataset B requires navigation through quiz-style interactive content (e.g., 'Quiz of the Year') absent from Dataset A's task requirements",
        "Dataset B tasks involve parsing content with dual geographic categorization (e.g., '24 hrs ago Africa') compared to Dataset A's single geographic tags",
        "Dataset B contains tasks requiring identification of content through compound category labels (e.g., 'Premier League' within sport sections) while Dataset A uses simpler hierarchical categorization",
        "Dataset B requires handling content cards with scientific/technical terminology (e.g., 'biodiversity hotspot', 'horologist') not prevalent in Dataset A's general news vocabulary",
        "Dataset B tasks involve navigating content with extended temporal ranges (e.g., 'Twenty years on' retrospectives) beyond Dataset A's focus on immediate recency",
        "Dataset B requires differentiation between official statements and speculative reporting (e.g., 'Russia warns against hypotheses') as a distinct content type",
        "Dataset B contains tasks requiring identification of content through cultural/historical context markers (e.g., '1970s dance style', '248-year history') absent from Dataset A's contemporary focus"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Dataset A tasks require locating and summarizing specific articles with precise metadata extraction (e.g., author names, publication dates)",
        "Dataset B tasks involve broader exploration of website sections without explicit summarization requirements",
        "Dataset A emphasizes granular content analysis (e.g., counting players, identifying image details)",
        "Dataset B includes multimedia-focused navigation (e.g., podcast episodes, video content)",
        "Dataset A tasks frequently require time-sensitive verification of live/breaking news updates",
        "Dataset B contains more general information-seeking tasks across non-news sections (e.g., weather, courses)",
        "Dataset A focuses on regional news analysis with specific geographic filters (e.g., Asia, Middle East)",
        "Dataset B includes technical navigation through multi-step trajectories between content types",
        "Dataset A requires identification of thematic categories within structured article metadata",
        "Dataset B tasks involve comparative analysis of content across temporal dimensions (e.g., 2024 scientific achievements)"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Dataset B includes tasks requiring navigation to niche topics not present in A (e.g., Crohn's disease breakthrough, specific music albums)",
        "Dataset B contains tasks involving podcast discovery and BBC Sounds navigation (absent in A's structured article retrieval)",
        "Dataset B requires accessing service-oriented content (e.g., hotel bookings) rather than pure information retrieval",
        "Dataset B features tasks analyzing user-generated content/social media posts (e.g., Vivek Ramaswamy's post)",
        "Dataset B includes hyper-local regional sections (e.g., Glasgow & West Scotland) not seen in A's broader categories",
        "Dataset B contains time-sensitive weather forecasting tasks requiring dynamic data interpretation",
        "Dataset B requires navigation through specialized science sections (e.g., 'Science & Environment' category)",
        "Dataset B tasks involve video-specific navigation (e.g., BBC video page browsing) beyond A's multimedia consumption",
        "Dataset B includes international crisis reporting beyond major regions (e.g., Syria conflict, Estonia investigations)",
        "Dataset B tasks require understanding interactive content formats (e.g., space mission simulations) not present in A"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Tasks in dataset B require gathering information across multiple unrelated sections (e.g., Formula 1 + business), while dataset A focuses on single-section navigation",
        "Dataset B includes tasks requiring practical actions beyond information retrieval (e.g., making donations, watching full-screen videos), unlike dataset A",
        "Dataset A tasks frequently specify exact article titles to locate ('What is climate change...'), while B uses thematic searches",
        "Dataset B contains tasks requiring future-oriented research (e.g., 'remote work trends for 2025'), while A focuses on current/recent events",
        "Dataset A tasks require precise extraction of numerical data (scores, player counts), while B focuses on conceptual understanding",
        "Dataset B includes open-ended exploratory tasks without specific targets ('Browse recent news'), unlike A's focused objectives",
        "Dataset B tasks span non-news domains like technology courses and hotel research, while A stays within news/sports coverage",
        "Dataset A requires identifying specific metadata (author names, exact publication dates) missing in B's requirements",
        "Dataset B tasks involve multi-format content interaction (articles + videos + podcasts), while A focuses on text/images",
        "Dataset B includes geographical-specific navigation (e.g., 'Sydney to Hobart race') requiring regional filtering, unlike A's categorical focus"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Dataset B tasks require navigating through localized or regional news sections not present in Dataset A (e.g., 'Tayside & Central Scotland', 'South Yorkshire')",
        "Dataset B includes tasks involving biographical/historical figure research (e.g., Manmohan Singh, Maria Callas) absent from Dataset A",
        "Dataset B tasks require accessing educational/course information (e.g., university programs) unlike Dataset A",
        "Dataset B contains tasks focused on specific entertainment industry developments (e.g., film sequels, celebrity flops) not emphasized in Dataset A",
        "Dataset B tasks involve navigation through niche sports categories (e.g., Cricket, Premier League teams) rather than hierarchical sport subdivisions",
        "Dataset B requires interaction with service-oriented content (e.g., hotel features, tourism information) absent from Dataset A",
        "Dataset B tasks emphasize finding specific incident reports with precise location details (e.g., 'New Orleans attack', 'Haiti hospital attack')",
        "Dataset B includes tasks requiring identification of diplomatic/military equipment details (e.g., MOD firearm losses) not present in Dataset A",
        "Dataset B tasks involve navigating cultural trend analyses (e.g., dance styles, social media evolution) rather than structured business categories",
        "Dataset B requires accessing anniversary/commemorative content (e.g., 20-year tsunami retrospectives) not featured in Dataset A"
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Dataset B tasks focus on general product categories without specific attribute requirements (e.g. 'Find pet supplies') while Dataset A requires detailed technical specifications (e.g. battery life measurements)",
        "Dataset B contains open-ended exploration tasks (e.g. 'Find gift ideas') whereas Dataset A requires structured parameter-based searches",
        "Dataset B tasks frequently involve luxury/pre-owned items (e.g. 'pre-owned Louis Vuitton') not present in Dataset A's new product focus",
        "Dataset B includes brand-specific searches without technical filters (e.g. 'AIRNEX eco-friendly sponges') while Dataset A combines brand with specifications",
        "Dataset B tasks emphasize purchase completion (e.g. 'Add to cart') more than comparison/analysis seen in Dataset A",
        "Dataset A requires explicit price sorting operations while Dataset B uses relative terms like 'cheapest' without sorting instructions",
        "Dataset B contains entertainment-focused tasks (movie rentals) absent from Dataset A's physical product focus",
        "Dataset A tasks require saving/search history management (e.g. 'save the lowest priced') not present in Dataset B",
        "Dataset B includes protection plan purchases (e.g. 'including protection plan') not mentioned in Dataset A tasks",
        "Dataset A requires return policy verification while Dataset B focuses solely on purchase completion"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Tasks in dataset B focus more on purchasing actions (e.g., 'Buy', 'Purchase') rather than detailed product research",
        "Dataset B includes tasks requiring account management (e.g., 'Create an Amazon account') not present in dataset A",
        "Tasks in dataset B emphasize quantity-based actions (e.g., 'Add 2 gifts') rather than attribute-based specifications",
        "Dataset B contains tasks involving digital services (e.g., Prime Video rentals) beyond physical products",
        "Tasks in dataset B frequently reference grocery items (e.g., Amazon Fresh) not seen in dataset A",
        "Dataset B includes luxury brand interactions (e.g., Aquazzura, Louis Vuitton) absent from dataset A",
        "Tasks in dataset B show increased focus on gift shopping without specific quality constraints",
        "Dataset B contains vague objectives (e.g., 'Browse for products') without defined filtering parameters",
        "Tasks in dataset B require handling pre-owned/refurbished items not mentioned in dataset A",
        "Dataset B emphasizes multi-item cart operations (e.g., 'Add 3 skincare products') rather than single-item optimization"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Dataset B tasks require navigation through Amazon's specialized verticals (e.g., Prime Video, Pharmacy, Luxury Stores) not present in Dataset A",
        "Dataset B tasks involve interaction with Amazon subscription services (Prime benefits, Kindle Unlimited, Grubhub partnerships)",
        "Dataset B includes tasks requiring navigation through Amazon Fresh grocery sections and fresh produce listings",
        "Dataset B contains tasks focused on luxury fashion brands and high-end products (e.g., Oscar de la Renta, Aquazzura)",
        "Dataset B tasks require handling CAPTCHA verification processes during grocery/fresh product searches",
        "Dataset B includes tasks involving Amazon Resale/Used marketplace features (e.g., 'pre-loved' items)",
        "Dataset B tasks require navigation through Amazon's virtual try-on/wishlist features for fashion items",
        "Dataset B contains tasks focused on Amazon's in-house brands (e.g., Amazon Basics) as primary targets",
        "Dataset B tasks involve interacting with time-sensitive Prime member exclusives (e.g., NFL Wild Card access)",
        "Dataset B includes tasks requiring navigation through Amazon's \"Shop by Interest\" curated collections"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Tasks in B require purchasing or adding generic categories (e.g., 'pet supplies') without price constraints, while A requires specific price-filtered items",
        "B includes instructions to find 'most expensive' items, whereas A exclusively focuses on budget-conscious searches",
        "B tasks involve product customization (e.g., 'customize AUTOMET Women's Oversized Flannel Shacket'), which never appear in A",
        "B contains gift-focused navigation (e.g., 'holiday gift ideas for gamer friends') absent from A's task structure",
        "B requires identifying Amazon-specific programs (e.g., 'Prime Video free trial') while A focuses on core product attributes",
        "Tasks in B mention seasonal sales events (e.g., 'Winter Sale') as primary filters, whereas A uses permanent deal identification",
        "B includes instructions to verify product legitimacy ('Check if search result is legitimate') not present in A's validation requirements",
        "B contains author-specific book searches (e.g., 'book written by J.K. Rowling') while A uses publication year/review thresholds",
        "B tasks require multi-item cart additions without specifications (e.g., 'Add 3 items'), whereas A specifies exact item characteristics",
        "B emphasizes brand-specific luxury products (e.g., 'Louis Vuitton', 'Aquazzura') absent from A's mid-range product focus"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Tasks in dataset A require specifying detailed product attributes (e.g., material, dimensions), while dataset B tasks are more generic (e.g., 'find eco-friendly products').",
        "Dataset A tasks involve explicit filtering/sorting by criteria (e.g., price range, ratings), whereas dataset B tasks lack such granular filtering requirements.",
        "Dataset A tasks often require validating customer reviews/ratings (e.g., '4+ stars'), while dataset B tasks rarely mention review verification.",
        "Dataset B includes tasks focused on testing website functionality (e.g., CAPTCHA), absent in dataset A.",
        "Tasks in dataset A require structured navigation (e.g., 'filter by size 6'), while dataset B has open-ended browsing (e.g., 'browse fashion').",
        "Dataset A emphasizes price comparisons between specific items, whereas dataset B focuses on singular price checks (e.g., 'find the price').",
        "Dataset B tasks frequently involve adding items to cart without detailed specifications (e.g., 'add 5 items'), unlike dataset A's criteria-based cart actions.",
        "Dataset A tasks require verifying delivery/shipping options explicitly, while dataset B tasks assume availability (e.g., 'buy pet supplies').",
        "Dataset B includes vague gift-related tasks (e.g., 'birthday gift ideas'), whereas dataset A specifies gifting parameters (e.g., 'graduation gift').",
        "Dataset A tasks often require product availability checks (e.g., stock status), while dataset B tasks omit inventory verification steps."
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Dataset B tasks include simpler arithmetic problems (e.g., 'Calculate 3 times 5') compared to Dataset A's advanced computational tasks.",
        "Dataset B tasks frequently involve general information retrieval (e.g., 'Find information on Spain's foreign debt') without requiring domain-specific calculations.",
        "Dataset B tasks explicitly request step-by-step explanations for basic problems, while Dataset A assumes computational proficiency.",
        "Dataset B includes health-related queries (e.g., BMI calculation, colon cancer causes) absent in Dataset A.",
        "Dataset A tasks demand precise real-world data (e.g., 'Itaipu Dam in 2023'), while Dataset B uses broader contexts (e.g., 'global climate study').",
        "Dataset B tasks focus on definitions and conceptual exploration (e.g., 'paradoxes', 'logic and truth') rather than applied problem-solving.",
        "Dataset A emphasizes multi-variable physical systems (e.g., spring pendulum dynamics), while Dataset B lacks such complexity.",
        "Dataset B includes linguistic analysis (e.g., 'Look up the word \"termination\"') not present in Dataset A.",
        "Dataset B tasks involve personal finance calculations (e.g., annuity payments), whereas Dataset A focuses on abstract financial metrics (e.g., raw memory of images).",
        "Dataset A requires visualizing mathematical outputs (e.g., plotting curves), while Dataset B prioritizes textual explanations (e.g., 'custom-viewable image of equation')."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Dataset B tasks more frequently require retrieval of historical, biographical, or cultural facts (e.g., Apollo 11 dates, King Charles III info).",
        "Dataset B includes explicit requests for step-by-step algebraic solutions (e.g., 'Solve 4x + 3 = 19') at a foundational level.",
        "Dataset B tasks involve exploratory queries about platform functionalities (e.g., 'Investigate Wolfram Language capabilities').",
        "Dataset B emphasizes conceptual explanations (e.g., 'What is the Riemann Hypothesis?') over purely computational outputs.",
        "Dataset B includes linguistics/humanities-focused tasks (e.g., etymology of 'love', logical paradoxes).",
        "Dataset B tasks often combine simple statistical computations with data interpretation (e.g., variance, standard deviation).",
        "Dataset B features requests for generating functions or mathematical sequence exploration (e.g., 'A000108 generating function').",
        "Dataset B integrates social science/historical contexts (e.g., GDP analysis, COVID-19 trends) alongside STEM.",
        "Dataset B tasks frequently merge computation with explanatory or descriptive outputs (e.g., 'Explain Buckingham pi theorem').",
        "Dataset B requires broader knowledge base access (e.g., climate models, elemental properties) rather than isolated calculations."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Tasks in dataset B focus on retrieving historical or time-series data (e.g., temperature anomalies, moon phases) compared to dataset A's focus on static or single-instance calculations.",
        "Dataset B includes tasks requiring exploration of conceptual or educational content (e.g., \"explain chemical thermodynamics\"), while dataset A emphasizes direct computational problem-solving.",
        "Dataset B tasks often involve browsing or researching broad topics (e.g., mathematical paradoxes, etymology), whereas dataset A tasks target precise quantitative outputs.",
        "Tasks in dataset B prioritize step-by-step learning workflows (e.g., \"learn about carbon\"), unlike dataset A's focus on executing multi-step calculations with immediate results.",
        "Dataset B includes open-ended queries about relationships between abstract concepts (e.g., Fibonacci and Collatz sequences), while dataset A focuses on closed-form solutions (e.g., integrals, equations).",
        "Dataset B tasks frequently involve descriptive statistics (e.g., beta distribution properties) rather than dataset A's comparative quantitative analyses (e.g., material property comparisons).",
        "Tasks in dataset B request basic definitions or explanations (e.g., \"what is the speed of light?\"), whereas dataset A assumes prior domain knowledge for advanced computations.",
        "Dataset B contains tasks requiring temporal event data (e.g., sunrise times, eclipse dates), while dataset A focuses on atemporal physical or mathematical constants.",
        "Dataset B includes metadata-focused queries (e.g., word etymology, paradox classifications) absent in dataset A's purely numerical problem-solving tasks.",
        "Tasks in dataset B emphasize exploratory navigation (e.g., \"browse polyhedra collections\"), contrasting with dataset A's structured, goal-oriented computational workflows."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Tasks in dataset B involve non-STEM domains like linguistics, economics, and social sciences (e.g., etymology, unemployment rates, historical events).",
        "Dataset B includes queries about Wolfram Alpha's platform features, pricing, or educational resources (e.g., 'Explore Wolfram Alpha's resources').",
        "Dataset B contains financial calculations (e.g., present value of annuities, investment growth) not present in dataset A.",
        "Tasks in dataset B explicitly request conceptual explanations (e.g., 'definition of magnetic field strength', 'learn about paradoxes').",
        "Dataset B includes meta-analytical queries (e.g., 'research mortgage options', 'economic situation analysis') requiring contextual synthesis.",
        "Dataset A focuses exclusively on numerical/visual STEM outputs, while B includes abstract knowledge retrieval (e.g., 'chemical properties of water').",
        "Dataset B contains historical data requests (e.g., 'Industrial Revolution dates', '1950 currency conversion') absent in dataset A.",
        "Tasks in dataset B require interpretation of real-world statistics (e.g., vaccine impact analysis, unemployment rate tracking).",
        "Dataset B includes self-referential queries about Wolfram Alpha's capabilities (e.g., 'features of WolframAlpha platform').",
        "Dataset B contains applied life-science queries (e.g., BMI calculations, weight loss planning) with practical health contexts."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Dataset B tasks focus on basic factual retrieval (e.g., element properties, boiling points) rather than multi-layered computational transformations like percentage composition analysis in Dataset A",
        "Dataset B contains exploratory system-testing instructions (e.g., 'Test various features of WolframAlpha') absent from Dataset A's focused problem-solving tasks",
        "Dataset A requires combined unit conversions with derived quantity calculations (e.g., mass\u2192moles\u2192composition) while Dataset B handles simple unit conversions or none",
        "Dataset B includes elementary equation solving (linear equations) while Dataset A focuses on higher mathematics (differential equations, complex integrals)",
        "Dataset A tasks demand specific visualization outputs (parametric plots, geometric constraints) whereas Dataset B visualization requests are generic/optional",
        "Dataset B contains open-ended exploration tasks (e.g., 'Explore properties of polyhedra') contrasting with Dataset A's precisely parameterized queries",
        "Dataset A requires integration of multiple physical constants/parameters per task while Dataset B uses singular known values (e.g., element atomic numbers)",
        "Dataset B includes personal/life science queries (nutrition, BMI) absent from Dataset A's academic/science-focused tasks",
        "Dataset A tasks frequently combine comparative analysis with computation (material comparisons) while Dataset B comparisons are simpler/numerical",
        "Dataset B contains repetitive basic arithmetic operations whereas Dataset A features unique complex mathematical operations per task"
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Tasks in B require using leftover ingredients or repurposing specific leftover items (e.g., Halloween candy, cranberry sauce), unlike A which focuses on fresh ingredient constraints.",
        "B includes tasks targeting kid-friendly recipes (e.g., vegetarian burgers, snacks) with emphasis on child appeal, while A prioritizes general dietary preferences without age-specific targeting.",
        "B emphasizes seasonal/holiday-specific recipes beyond general holidays (e.g., Thanksgiving, Valentine's Day desserts), whereas A focuses on broader seasonal categories like Easter/Christmas.",
        "Tasks in B involve saving/printing recipes explicitly (e.g., 'save the Zucchini Risotto recipe'), unlike A which focuses on retrieval without explicit saving actions.",
        "B requires locating complementary dishes (e.g., 'what to serve with lasagna'), while A focuses solely on standalone recipe retrieval.",
        "B includes tasks for recipe variations/adaptations (e.g., 'different grilled shrimp recipes'), whereas A emphasizes fixed constraints like exact cooking times.",
        "Tasks in B feature open-ended exploration of categories (e.g., 'healthy recipes', 'Italian Christmas cookies'), while A uses precise numerical thresholds for ratings/reviews.",
        "B contains requests for cooking techniques/tools (e.g., 'without candy thermometer', 'air fryer dinners'), unlike A's focus on ingredient/time constraints.",
        "Tasks in B involve active user participation (e.g., leaving reviews, rating recipes), while A focuses purely on information extraction.",
        "B includes meal prep/storage requirements (e.g., 'storing leftovers', 'meal prep chicken recipes'), whereas A emphasizes immediate preparation details."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Dataset B tasks emphasize meal prep and weekly meal planning, while Dataset A focuses on immediate recipe selection.",
        "Tasks in Dataset B frequently involve modifying or customizing existing recipes based on user preferences, whereas Dataset A prioritizes existing user reviews and ratings.",
        "Dataset B includes tasks related to substituting ingredients (e.g., evaporated milk substitutes), which are absent in Dataset A.",
        "Dataset B tasks require comparing multiple recipe categories (e.g., BBQ sauces vs. grilled vegetables), while Dataset A comparisons focus on individual recipe attributes like cook time.",
        "Tasks in Dataset B explicitly target kid-friendly recipes and snacks, unlike Dataset A.",
        "Dataset B includes holiday-specific recipe planning for non-traditional events (e.g., Halloween snacks, Valentine\u2019s Day cocktails), while Dataset A focuses on major holidays like Easter/Christmas.",
        "Dataset B tasks involve searching for kitchen appliances or pre-made products (e.g., gingerbread house kits), which are not present in Dataset A.",
        "Tasks in Dataset B mention printing/saving recipes for physical use, whereas Dataset A focuses on digital bookmarking.",
        "Dataset B emphasizes full-menu planning (e.g., vegan holiday parties), while Dataset A prioritizes standalone recipes.",
        "Dataset B includes tasks centered on repurposing leftovers (e.g., ham/mashed potatoes), which are absent in Dataset A."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Dataset B tasks include saving/bookmarking specific recipes by name, while A focuses on filtering and listing based on criteria",
        "Dataset B requires comparing nutritional information between multiple recipes, not present in A",
        "Dataset B tasks involve modifying recipes (e.g., ingredient substitutions), absent in A's requirements",
        "Dataset B includes explicit user interaction tasks (e.g., leaving reviews/ratings), not required in A",
        "Dataset B contains broader exploration tasks without specific filters (e.g., 'Find some dinner ideas')",
        "Dataset B features holiday-specific preparation tasks (e.g., gingerbread house construction)",
        "Dataset B tasks reference accessing community tips/featured content, unlike A's independent filtering",
        "Dataset B includes meal planning for events/leftovers management, not emphasized in A",
        "Dataset B tasks require identifying recipe variations/adaptations, while A focuses on static attributes",
        "Dataset B contains price/cost-related queries (e.g., cookbook prices), absent in A's tasks"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Dataset B tasks frequently involve user interaction elements like leaving reviews or asking for ingredient substitutions, while Dataset A focuses on retrieving existing user reviews and ratings.",
        "Dataset A tasks specify exact numerical thresholds for reviews (e.g., 'over 500 reviews'), whereas Dataset B tasks omit such quantitative requirements.",
        "Dataset A tasks explicitly require nutritional details like calorie counts or carbohydrate content, while Dataset B tasks mention dietary constraints (e.g., keto) without always specifying exact nutritional metrics.",
        "Dataset B includes tasks related to meal planning/preparation (e.g., 'meal prep ideas'), which are absent in Dataset A.",
        "Dataset A tasks emphasize finding 'most popular' or 'top-rated' recipes as quality indicators, while Dataset B prioritizes general discovery (e.g., 'explore search results').",
        "Dataset B tasks often involve saving multiple recipes (e.g., 'save multiple BBQ recipes'), whereas Dataset A tasks focus on retrieving a single recipe meeting strict criteria.",
        "Dataset A tasks require structured outputs like ingredient lists and cooking steps, while Dataset B tasks are more open-ended (e.g., 'find ideas for kid-friendly fruit salads').",
        "Dataset B includes tasks about seasonal/holiday-specific recipes across multiple events (e.g., Halloween, Thanksgiving), while Dataset A focuses on broader categories like Easter/Christmas.",
        "Dataset A tasks specify precise cooking time constraints (e.g., 'under 30 minutes'), whereas Dataset B uses vague time references like 'quick' or 'easy.'",
        "Dataset B tasks involve exploring cuisine categories (e.g., 'Asian cuisine'), while Dataset A tasks filter by ingredient/meal type without broader culinary exploration."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Tasks in B require user interaction elements like leaving reviews or community questions",
        "B includes tasks focused on budget-friendly or affordable meal recipes",
        "B tasks specifically target holiday/event-themed recipes (e.g. Christmas, Halloween, Valentine's Day)",
        "B requires navigation through kid-friendly meal categories and snack ideas",
        "Tasks in B emphasize repurposing leftovers and ingredient reuse strategies",
        "B contains explicit requests for international cuisine recipes (e.g. Korean BBQ, Thai)",
        "B tasks involve community engagement features like recipe substitutions and review exploration",
        "B includes meal type-specific navigation (appetizers, desserts) rather than just dish attributes",
        "Tasks in B require interaction with user-generated content (reading/saving others' recipes)",
        "B contains requests for beginner-friendly recipes and cooking skill level considerations"
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Dataset B tasks require users to focus on US pronunciation variants exclusively, while Dataset A includes both UK/US variants",
        "Dataset B includes tasks involving syllable count analysis for words, not present in Dataset A",
        "Dataset B tasks require exploration of business-specific vocabulary and terminology, unlike Dataset A",
        "Dataset B contains tasks focused specifically on adjective usage in grammar, while Dataset A covers broader grammatical concepts",
        "Dataset B tasks involve finding synonyms/antonyms related to thematic concepts (e.g., luck/fortune), whereas Dataset A uses general synonym lookup",
        "Dataset B requires Spanish language translations in tasks, while Dataset A focuses on Chinese translations",
        "Dataset B includes tasks analyzing word relationships and conceptual connections between terms, not present in Dataset A",
        "Dataset B contains tasks requiring user feedback on example sentences, a feature absent from Dataset A",
        "Dataset B tasks involve investigating etymological origins of words, while Dataset A focuses strictly on current definitions",
        "Dataset B includes specific references to Scrabble word game information searches, whereas Dataset A only references Word Scramble"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Tasks in B focus on broader language concepts (e.g., greetings, farewells) rather than granular grammar rules (e.g., passive voice, articles).",
        "B includes tasks requiring exploration of collocations and word combinations (e.g., \"affordable accommodation\"), absent in A.",
        "B tasks often omit explicit requirements for pronunciation variants (UK/US) compared to A's emphasis on both.",
        "B tasks involve defining general terminology (e.g., \"business terminology\") without requiring example sentences or contextual usage.",
        "B includes open-ended navigation tasks (e.g., \"test the agent's ability to navigate different pages\") not present in A's structured tasks.",
        "B tasks lack explicit references to interactive elements like the Word Scramble game, which A explicitly requires.",
        "B tasks request basic verb form explanations (e.g., \"basic forms of verbs\") rather than advanced grammar structures (e.g., indirect speech).",
        "B tasks require translating common phrases (e.g., days of the week) rather than specific words with contextual translations.",
        "B includes tasks about conceptual relationships (e.g., \"market research and advertising\"), absent in A's discrete lookup tasks.",
        "B tasks often omit requirements for synonym/antonym specificity (e.g., \"other ways to say...\") compared to A's explicit Thesaurus use."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Tasks in dataset A require providing example sentences as part of word definitions, while dataset B does not consistently mandate example sentence retrieval",
        "Dataset A tasks explicitly require distinguishing between UK/US pronunciations for every word lookup, whereas dataset B tasks may accept single pronunciation variants",
        "Dataset A includes tasks requiring language conversion of the entire interface (e.g., English to Deutsch), while dataset B focuses on content translation between languages",
        "Dataset B contains tasks related to pedagogical research (e.g., teaching methods preparation), which are absent in dataset A",
        "Dataset A tasks specify interaction with named dictionary sections (Thesaurus/Grammar) through explicit navigation, while dataset B uses more generic terminology",
        "Dataset B includes tasks about psychological terminology and specialized idioms that don't appear in dataset A's scope",
        "Dataset A consistently requires users to engage with the Word Scramble game feature, while dataset B tasks never reference specific game interactions",
        "Dataset A tasks demand multi-step verification (definition+pronunciation+translation) for single queries, while dataset B allows simpler single-aspect lookups",
        "Dataset B contains test preparation objectives (e.g., TOEFL vocabulary) not present in dataset A's academic focus",
        "Dataset A uses precise instructional verbs ('report', 'provide', 'convert'), while dataset B employs broader terms ('explore', 'research', 'improve')"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Tasks in dataset B require bulk synonym retrieval (e.g., 'get as many synonyms as possible') rather than single-word/phrase synonym lookups",
        "Dataset B includes multi-word translation requests (e.g., 'apple and Friday') while A focuses on single word/phrase translations",
        "Tasks in B emphasize exploring related terms/concepts (e.g., 'terms related to investments') beyond simple synonym retrieval",
        "Dataset B contains explicit requests for phonetic readings (e.g., 'phonetic reading of shown') not present in A's tasks",
        "B includes quiz completion tasks (e.g., 'animal-related quiz') as distinct interactive elements beyond word games",
        "Tasks in B require broader exploration of website features (e.g., 'explore the features and definitions') rather than targeted lookups",
        "Dataset B contains requests for code retrieval (e.g., 'find the code for solve') not present in A's tasks",
        "B includes comparative analysis tasks (e.g., 'compare meanings and synonyms') between multiple words/concepts",
        "Tasks in B require antonym exploration (e.g., 'emotions and their antonyms') alongside synonym retrieval",
        "Dataset B contains explicit content sharing tasks (e.g., 'share definition on Twitter') as primary objectives rather than incidental features"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Tasks in dataset A explicitly require providing both UK and US pronunciations for words, while dataset B tasks may request only one variant.",
        "Dataset A tasks specify exact languages for translation (e.g., Chinese, French), while dataset B focuses more broadly on Spanish/Dutch translations without explicit language pairing requirements.",
        "Dataset A tasks consistently demand example sentences as part of responses, while dataset B sometimes omits this requirement in favor of pure definitions/phonetics.",
        "Tasks in dataset A reference structured alphabetical browsing (e.g., 'Browse the English Dictionary 0\u20139, a-z'), while dataset B tasks show no evidence of using alphabetical navigation.",
        "Dataset A includes explicit references to annual 'Word of the Year' features (2021-2024), while dataset B tasks lack temporal references to editorial content.",
        "Dataset B tasks involve phrase-level translation queries (e.g., 'break a leg', 'bit by bit'), whereas dataset A focuses exclusively on single-word translations.",
        "Tasks in dataset A require interaction with specific versioned resources (e.g., 'British Grammar'), while dataset B uses generic grammar section references without regional specifications.",
        "Dataset B includes social media sharing tasks (e.g., 'Share on Facebook/X'), which are absent in dataset A's requirements.",
        "Dataset A tasks explicitly reference publication dates for blog content (e.g., March 31, 2025), while dataset B lacks time-bound content interactions.",
        "Dataset B contains exploratory meta-tasks about understanding the dictionary's capabilities, while dataset A focuses purely on information extraction."
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Tasks in dataset B require interaction with product customization interfaces (e.g., Watch band combinations)",
        "Dataset B includes navigation tasks focused on enterprise/business solutions (Apple Business Manager setup)",
        "Dataset B contains healthcare-specific product research (medical device compatibility, Health Records)",
        "Tasks in B require locating technical documentation (repair manuals, setup guides)",
        "Dataset B includes educational institution success story exploration (K-12 implementations)",
        "B contains tasks about data privacy implementation details (user data utilization, privacy features)",
        "Dataset B requires comparing accessories across product lines (AirPods Max vs Pro)",
        "Tasks in B involve environmental sustainability plan navigation (carbon neutral roadmap)",
        "Dataset B includes app developer-focused content (App Store Award winner details)",
        "B contains family sharing configuration tasks (member management, setup workflows)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Dataset B includes tasks requiring navigation to enterprise/business-specific sections (e.g., business plans, enterprise team contacts)",
        "Dataset B contains tasks focused on environmental impact comparisons between multiple product categories",
        "Dataset B requires accessing Family Sharing management features beyond basic account settings",
        "Dataset B includes warranty verification processes with repair/service options",
        "Dataset B tasks involve parental control configuration for child devices",
        "Dataset B requires price checks for specific accessories (HomePod, MagSafe cases) rather than main products",
        "Dataset B contains tasks about health monitoring features (calorimetry, battery optimization guides)",
        "Dataset B includes navigation to corporate/financial sections (quarterly earnings, investor relations)",
        "Dataset B requires comparing enterprise-focused services (business purchases, volume licensing)",
        "Dataset B tasks involve multi-device ecosystem management (integration between Mac/iPhone/Apple Watch)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Dataset A tasks focus on exact technical specifications retrieval (e.g., resolution measurements, precise weight values)",
        "Dataset B requires comparing abstract product attributes like environmental sustainability across categories",
        "Dataset A emphasizes identification of concrete numerical values (storage capacities, exact pricing)",
        "Dataset B includes tasks requiring exploration of business/enterprise use cases and success stories",
        "Dataset A focuses on individual consumer product configurations (storage upgrades, color selections)",
        "Dataset B contains explicit requirements to analyze corporate/institutional purchasing options (education discounts)",
        "Dataset A tasks target specific component details (Siri Remote features, camera sensor specs)",
        "Dataset B requires cross-referencing product information with external factors (environmental reports, financial earnings)",
        "Dataset A navigation paths emphasize direct product comparison matrices (generation-to-generation feature differences)",
        "Dataset B includes tasks combining purchasing workflows with research objectives (price checking while reviewing sustainability)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Dataset B tasks focus on enterprise/business solutions and corporate service inquiries",
        "Dataset B requires navigation through AppleCare+ and technical support service workflows",
        "Tasks in B involve configuring enterprise-specific product bundles or business-oriented services",
        "Dataset B includes requirements to locate business-focused documentation (e.g. Business Conduct Policy)",
        "B tasks require interaction with enterprise device management and bulk purchasing workflows",
        "Dataset B contains tasks related to financial reporting and investor information access",
        "B tasks involve family account management and parental control configurations",
        "Dataset B requires navigation through environmental impact reports and sustainability documentation",
        "Tasks in B focus on customized product configurations beyond standard consumer options",
        "Dataset B includes requirements to compare business-oriented service plans and enterprise solutions"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Tasks in B require configuring specific product bundles or business-oriented packages (e.g., iPad Pro for business)",
        "Dataset B includes tasks focused on post-purchase support documentation retrieval (e.g., repair manuals, warranty status checks)",
        "B contains tasks requiring analysis of app-specific technical details (e.g., version history, app reviews)",
        "Dataset B emphasizes compatibility checks for accessories with specific device models (e.g., iPhone 16 case compatibility)",
        "Tasks in B require navigation through healthcare-specific use cases (e.g., Health Records enrollment statistics)",
        "Dataset B includes queries about enterprise/business solutions documentation and pricing",
        "B tasks focus on battery optimization guidance and device maintenance troubleshooting",
        "Dataset B requires comparison of privacy/security features across services (e.g., data handling policies)",
        "Tasks in B involve parental control configuration and family sharing limitations",
        "Dataset B contains specific carrier/band customization tasks for wearable devices"
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Dataset B tasks commonly involve consumer purchase decisions (e.g., product comparisons, price checks, shipping details) while A focuses purely on technical specifications",
        "Dataset B includes explicit requests for health/medical information (symptoms, treatments, risk factors) unlike A's technical/science-focused queries",
        "Dataset B contains more career/job search-related tasks (job listings, employment requirements) absent in A",
        "Dataset B tasks frequently require engagement with educational/DIY content (tutorials, parenting advice, woodworking projects) unlike A's academic/research focus",
        "Dataset B emphasizes practical daily life activities (event planning, recipe comparison, local venue finding) while A prioritizes scientific/statistical data extraction",
        "Dataset B includes explicit platform-specific feature requests (Google Settings adjustments, Chrome download) that A avoids",
        "Dataset B tasks more frequently involve personal location-based needs (near me, local venues, directions) compared to A's geographic data analysis",
        "Dataset B contains language learning/translation tasks absent in A's technical documentation focus",
        "Dataset B includes political/news analysis requests (war coverage, Trump news) while A focuses on neutral factual reporting",
        "Dataset B features fashion/trend-related queries (clothing trends, Gucci shows) not present in A's technical product comparisons"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Dataset B tasks frequently involve personal lifestyle activities (e.g., recipes, parenting advice, language learning) while A focuses on technical/professional data extraction",
        "B includes explicit user-generated content interaction (e.g., rating recipes, sharing articles) absent in A's tasks",
        "B requires location-aware searches (e.g., 'near me', specific city venues) not emphasized in A",
        "B contains health management tasks (e.g., condition-specific recipes, symptom research) unlike A's technical specifications",
        "B features transactional objectives (e.g., ticket purchases, job applications) whereas A focuses purely on information retrieval",
        "B emphasizes real-time personal decision making (e.g., stock comparisons, hotel selection) vs A's static factual reporting",
        "B tasks often require interacting with niche platforms (e.g., Supercook, Duolingo) beyond A's common services (GitHub, IMDb)",
        "B includes explicit credibility verification of non-academic sources (e.g., blog authors) while A assumes pre-verified authority",
        "B involves multi-session persistence (e.g., 'start a course', 'maintain recipe database') unlike A's single-session extractions",
        "B tasks frequently require parameter customization (e.g., dietary restrictions, event amenities) absent in A's predefined queries"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Dataset A tasks focus on extracting exact numerical or factual data (e.g., dates, measurements, counts), while Dataset B emphasizes exploratory understanding or procedural actions (e.g., research summaries, event bookings).",
        "Dataset A tasks often require direct copy-paste operations (e.g., SHA codes, specific metrics), whereas Dataset B tasks involve broader information synthesis (e.g., trends, concepts).",
        "Dataset A tasks prioritize real-time or dynamic data validation (e.g., \"as of today's date\"), while Dataset B tasks reference recent but non-immediate timeframes (e.g., \"this year\").",
        "Dataset A includes explicit multi-step verification across platforms (e.g., FlightAware + search results), whereas Dataset B relies on single-source exploration (e.g., Google Research publications).",
        "Dataset A tasks frequently involve structured hierarchical navigation (e.g., search \u2192 GitHub \u2192 commit history), while Dataset B tasks focus on open-ended discovery (e.g., \"explore stock performance\").",
        "Dataset A emphasizes domain-specific technical outputs (e.g., hardware requirements, planetary classifications), while Dataset B includes practical applications (e.g., job applications, recipe ingredients).",
        "Dataset B contains interactive transactional tasks (e.g., booking tickets, signing up for newsletters) absent in Dataset A.",
        "Dataset A tasks demand precision in temporal constraints (e.g., \"year before last\"), whereas Dataset B uses relative temporal language (e.g., \"recent\", \"latest\") without strict recency enforcement.",
        "Dataset B includes tasks requiring content contribution (e.g., Wikipedia edits, recipe database updates), while Dataset A focuses solely on extraction.",
        "Dataset A tasks target discrete ranked outcomes (e.g., \"top 3 planets\"), whereas Dataset B involves qualitative comparisons (e.g., stock performance narratives without explicit rankings)."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "Dataset B tasks frequently involve transactional actions (e.g., booking, purchasing, configuring products) while Dataset A focuses on passive data extraction.",
        "Dataset B includes tasks requiring localized or proximity-based results (e.g., 'near Palo Alto') whereas Dataset A tasks are geographically neutral.",
        "Dataset B tasks often involve lifestyle or consumer-oriented goals (e.g., recipes, parenting tips) compared to Dataset A's technical/scientific focus.",
        "Dataset B tasks explicitly require interacting with user-generated content (e.g., community forums, event inspiration blogs) while Dataset A prioritizes authoritative sources.",
        "Dataset B includes health-related information retrieval (e.g., symptoms, treatment options) absent in Dataset A.",
        "Dataset B tasks more commonly involve multi-source comparison (e.g., translation outputs, product configurations) than Dataset A's single-source verification.",
        "Dataset B requires parsing instructional/how-to content (e.g., tutorials, project guides) unlike Dataset A's factual record extraction.",
        "Dataset B tasks emphasize real-time availability checks (e.g., hotel rooms, event tickets) whereas Dataset A focuses on static factual accuracy.",
        "Dataset B includes career/job-related searches (e.g., remote software engineer jobs) not present in Dataset A.",
        "Dataset B tasks frequently target ephemeral or trend-based information (e.g., fashion trends, anticipated movies) compared to Dataset A's historical/enduring data focus."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Dataset A tasks primarily require extracting exact numerical/technical specifications (e.g., hardware requirements, SHA codes, distances) while Dataset B focuses on retrieving descriptive information (e.g., definitions, advice, event ideas).",
        "Dataset B includes tasks involving direct interaction with transactional elements (e.g., purchasing tickets, checking hotel prices) absent in Dataset A.",
        "Dataset A emphasizes retrieving time-sensitive dynamic data (e.g., latest scores, current planetary positions) whereas Dataset B more frequently involves static educational/health resources (e.g., disease causes, academic programs).",
        "Dataset B contains tasks requiring navigation through opinion-based content (e.g., product reviews, recipe rankings) while Dataset A focuses on objective comparisons of factual entities (e.g., movie ratings, planetary characteristics).",
        "Dataset A tasks demand extraction of precise identifiers (e.g., version numbers, proper names) whereas Dataset B more often requires paraphrasing conceptual information (e.g., AI principles, machine learning explanations).",
        "Dataset B includes explicit instructions for interface exploration (e.g., testing modals, managing browser settings) not present in Dataset A's search-and-extract paradigm.",
        "Dataset A tasks frequently involve temporal constraints for verification (e.g., 'as of today's date') while Dataset B more commonly references spatial constraints (e.g., location-based venue searches).",
        "Dataset B contains tasks requiring synthesis of procedural guidance (e.g., software installation steps, language settings management) absent from Dataset A's single-source extraction focus.",
        "Dataset A prioritizes authoritative technical sources (e.g., GitHub repositories, scientific databases) while Dataset B more often utilizes commercial/consumer platforms (e.g., hotel booking sites, app stores).",
        "Dataset B includes tasks requiring comparison of subjective metrics (e.g., stock market performance, product features) whereas Dataset A comparisons focus on objective rankings (e.g., movie ratings, planetary data)."
      ]
    }
  },
  "diffs_real_from_synth": {
    "google_maps": {
      "nnetnav_live_site=google_maps_num_tasks=75_portion=2": [
        "Dataset B tasks require generating printable outputs (e.g., PDF maps) or summarized information reports, while A focuses on in-app decision-making",
        "Dataset B contains tasks requiring analysis of proportional statistics (e.g., 'least proportion in reviews') not present in A",
        "Dataset B emphasizes specific parking types (motorcycle, EV charging, 24-hour) as primary search criteria rather than general amenities",
        "Dataset B includes explicit requirements to verify negative operational constraints (e.g., 'isn't open 24 hours') more frequently than A",
        "Dataset B tasks demand precise quantitative thresholds (e.g., 'ratings >4.8', '5 beauty salons') more rigorously than A's general comparisons",
        "Dataset B requires identification of infrastructure hierarchies (e.g., airport levels) not present in A's location queries",
        "Dataset B contains tasks focused on product category availability (e.g., 'kids' and maternity products') rather than service amenities",
        "Dataset B includes explicit requirements to extract and report numerical route details (distance, duration, steps) beyond basic directions",
        "Dataset B tasks frequently specify zip code-based searches rather than A's landmark/neighborhood-centric queries",
        "Dataset B requires identification of manufacturer-specific amenities (e.g., Tesla chargers) rather than general category filters"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=3": [
        "Dataset B tasks require generating printable outputs (e.g., PDF maps, printed route details) as part of instructions, unlike A",
        "Dataset B includes explicit requests for statistical comparisons (e.g., 'least proportion in reviews') not present in A",
        "Dataset B tasks specify exact quantity requirements (e.g., '5 beauty salons') rather than qualitative thresholds like 'multiple options' in A",
        "Dataset B emphasizes parking infrastructure characteristics (24-hour operations, non-24hr garages) more granularly than A's general parking availability queries",
        "Dataset B contains more specific transportation mode requirements (motorcycle/bicycle parking, EV charger types) compared to A's general transit hub searches",
        "Dataset B tasks frequently use zip codes as primary geographic identifiers rather than neighborhoods/landmarks prevalent in A",
        "Dataset B includes explicit vehicle type specifications (Tesla Destination Chargers) not found in A's general EV charging queries",
        "Dataset B requires analysis of facility hierarchy levels (airport terminal levels) absent from A's vertical structure tasks",
        "Dataset B tasks demand comparative route optimization metrics ('least walking') rather than A's basic route planning",
        "Dataset B specifies commercial chain stores (Best Buy) as targets more frequently than A's generic business category searches"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=1": [
        "Dataset B tasks include generating/saving physical/digital documents (e.g., 'print map as PDF') while Dataset A focuses on digital actions without document creation",
        "Dataset B requires information summarization from map data (e.g., 'summarize the information on the map') unlike Dataset A's focus on specific attribute retrieval",
        "Dataset B contains tasks analyzing review distribution patterns (e.g., 'which level has least proportion in reviews') while Dataset A focuses on review content evaluation",
        "Dataset B includes specific infrastructure queries (EV charging stations, motorcycle parking) not present in Dataset A",
        "Dataset B tasks explicitly request operational hour exceptions (e.g., 'parking that isn't open 24 hours') while Dataset A focuses on general hour verification",
        "Dataset B contains product category searches within retail locations (e.g., 'kids' and maternity products') absent from Dataset A's service-oriented searches",
        "Dataset B features exact numeric thresholds for ratings (e.g., 'greater than 4.8') compared to Dataset A's range-based thresholds (e.g., '4.5 stars or higher')",
        "Dataset B includes spatial relationship queries using zip codes as anchors (e.g., 'within 2 miles of zip code 80202') while Dataset A uses landmarks/street addresses",
        "Dataset B tasks require identification of specific corporate entities (e.g., 'Best Buy store', 'Tesla Destination Charger') unlike Dataset A's generic category searches",
        "Dataset B contains hierarchical facility analysis (e.g., airport level comparisons) not found in Dataset A's single-location focus"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=0": [
        "Dataset B tasks require generating or saving map data outputs (e.g., PDF printing, route detail documentation)",
        "Dataset B includes explicit requirements to analyze spatial distribution patterns (e.g., review proportions across facility levels)",
        "Dataset B tasks focus more on infrastructure-specific searches (parking types, EV charging, motorcycle/bicycle parking)",
        "Dataset B contains tasks requiring identification of commercial chains/branded locations (e.g., Best Buy, Tesla Destination Charger)",
        "Dataset B emphasizes exact operational hour verification for facilities (24-hour vs non-24-hour availability)",
        "Dataset B tasks require direct comparison of physical infrastructure characteristics (parking lot sizes, level distributions)",
        "Dataset B includes explicit zip code-based proximity searches rather than landmark-based references",
        "Dataset B tasks demand specific numerical thresholds for results (exact rating thresholds like >4.8)",
        "Dataset B requires identification of transportation infrastructure hierarchy (nearest/most convenient stops to intersections)",
        "Dataset B tasks focus on energy/service infrastructure compatibility (EV charging support at specific locations)"
      ],
      "nnetnav_live_site=google_maps_num_tasks=75_portion=4": [
        "Dataset B includes tasks requiring data export/printing (e.g., 'print map as PDF') while Dataset A focuses purely on digital information consumption",
        "Dataset B contains analytical tasks comparing review proportions across categories (e.g., 'which level has least proportion in reviews') not present in Dataset A",
        "Dataset B emphasizes specific infrastructure requirements (EV charging stations, motorcycle parking) rather than general accessibility features",
        "Dataset B requires exact numerical answers (e.g., '5 beauty salons', '2000 ft') while Dataset A uses relative qualifiers ('best', 'highly-rated')",
        "Dataset B includes tasks analyzing parking logistics (24-hour availability, non-24hr garages) rather than general availability checks",
        "Dataset B focuses on user comment analysis ('review user comments about it') where Dataset A uses reviews for filtering purposes only",
        "Dataset B contains specific retail product requirements (kids/maternity products) absent from Dataset A's service-oriented tasks",
        "Dataset B includes nature reserve information gathering tasks rather than urban itinerary planning",
        "Dataset B requires identification of single nearest points (e.g., 'one bus stop nearest') versus Dataset A's preference for multiple options",
        "Dataset B features brand-specific infrastructure queries (Tesla Destination Charger) not found in Dataset A"
      ]
    },
    "github": {
      "nnetnav_live_site=github_num_tasks=71_portion=3": [
        "Tasks in B require identifying specific courses in GitHub Skills (e.g., 'Resolve merge conflicts course') with explicit action details, while A focuses on general educational resources (e.g., onboarding, student packs).",
        "Tasks in B demand reporting exact numerical metrics (e.g., 'highest number of forks', 'top three contributors') from repositories, whereas A emphasizes understanding feature capabilities (e.g., Copilot pricing tiers).",
        "B includes tasks to identify time-bound repository attributes (e.g., 'updated within the last 2 days', 'created in January 2023'), while A's time-related criteria are broader (e.g., 'recency' without specific thresholds).",
        "B requires extracting structured data from trending/ranked lists (e.g., 'currently ranked first this month'), whereas A lacks tasks involving leaderboard-style comparisons.",
        "Tasks in B involve identifying repository file-level changes (e.g., 'files changed in the most recent commit'), while A focuses on commit history analysis at a contributor level.",
        "B includes explicit requirements to report project descriptions and purposes (e.g., 'project's purpose and programming language'), whereas A's repository searches focus on technical attributes (stars, language).",
        "Tasks in B require filtering repositories by both technical and domain-specific criteria (e.g., 'climate change data visualization'), while A's repository searches use generic technical filters (language, stars).",
        "B contains tasks to locate specific educational workflow steps (e.g., 'actions learners will perform in this course'), while A's educational tasks focus on resource discovery rather than procedural details.",
        "Tasks in B demand comparison of version-specific attributes (e.g., 'latest release version of React'), whereas A's comparisons focus on plan tiers rather than software versions.",
        "B includes explicit requirements to identify closed issues (e.g., 'last three issues closed'), while A's issue-related tasks focus on general management rather than resolution status."
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=2": [
        "Tasks in B require locating specific GitHub Skills course content and learning objectives (e.g., 'Resolve merge conflicts' course actions)",
        "Tasks in B demand identification of exact numerical rankings in dynamic content (e.g., 'currently ranked first this month' in Trending)",
        "Tasks in B require comparison of quantitative storage differences between enterprise tiers (e.g., 'package storage Enterprise vs Team')",
        "Tasks in B involve identifying precise time-bound repository creation criteria (e.g., 'initiated in January 2023')",
        "Tasks in B require analysis of commit-level file changes in repositories (e.g., 'files changed in most recent commit')",
        "Tasks in B necessitate evaluation of multiple quantitative filters simultaneously (e.g., '500+ stars AND updated past 2 days')",
        "Tasks in B require identification of niche technical domains in repository searches (e.g., 'protein prediction', 'natural language processing in Ruby')",
        "Tasks in B demand verification of course structure hierarchies (e.g., 'courses under First day on GitHub heading')",
        "Tasks in B require tracking of real-time release information (e.g., 'latest release version and publish date')",
        "Tasks in B involve mobile-specific feature verification (e.g., 'Copilot chat mobile usage timing')"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=0": [
        "Tasks in B require identifying specific numerical metrics (e.g., stars, forks, contributor counts) rather than general feature exploration",
        "B tasks involve locating time-bound content (e.g., 'last 15 days', 'January 2023') not present in A",
        "B requires extracting ranked lists (e.g., 'top three contributors', 'most popular') while A focuses on comparisons without ranking",
        "Tasks in B demand precise identification of temporal project details (creation/update dates) absent in A",
        "B includes tasks requiring analysis of repository commit histories and file changes, unlike A",
        "B tasks focus on locating specific educational course components (e.g., 'actions learners will perform') not seen in A",
        "Dataset B requires identification of exact contributor names and project maintainers, while A does not",
        "B tasks involve searching for projects using compound technical criteria (e.g., 'natural language processing in Ruby') rather than single filters",
        "B requires reporting specific version/release information (e.g., 'latest release version and date') not present in A",
        "Tasks in B demand validation of project currency (e.g., 'newly created', 'updated within last week') through temporal filters absent in A"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=4": [
        "Dataset A tasks focus on account actions (sign-up, trial requests) while B focuses on content discovery without account interactions",
        "Dataset B requires identification of repository contributors and trending rankings, absent in A",
        "Dataset A tasks involve policy documentation (terms, data handling) not present in B",
        "Dataset B emphasizes time-bound repository filters (last X days/weeks) for project discovery",
        "Dataset A contains specific security vulnerability lookups (CVE-IDs) while B focuses on general security tool popularity",
        "Dataset B requires identification of course structures/activities in GitHub Skills, absent in A",
        "Dataset A tasks include plan upgrade paths (Free\u2192Pro) not present in B",
        "Dataset B emphasizes quantitative repository metrics (stars count thresholds, contributor rankings)",
        "Dataset A focuses on Copilot feature eligibility/limitations while B focuses on Copilot documentation FAQs",
        "Dataset B requires identification of release versions/dates from repositories, absent in A"
      ],
      "nnetnav_live_site=github_num_tasks=71_portion=1": [
        "Dataset A tasks focus on internal GitHub product features and documentation navigation, while Dataset B focuses on external repository exploration and community content",
        "Dataset A requires interacting with help/documentation sections for feature usage, while Dataset B requires searching/filtering public repositories",
        "Dataset A tasks involve comparing subscription plans and account management, while Dataset B tasks involve analyzing repository statistics (stars/forks/contributors)",
        "Dataset A emphasizes security feature documentation navigation, while Dataset B emphasizes finding specific technical projects/codebases",
        "Dataset A contains tasks related to Copilot trial/plan upgrades, while Dataset B contains tasks about trending/open-source project discovery",
        "Dataset A tasks require parsing API documentation comparisons, while Dataset B tasks require evaluating repository activity timelines",
        "Dataset A focuses on customer story discovery from known companies, while Dataset B requires identifying unknown popular repositories",
        "Dataset A involves navigating structured product sections (Actions/Security), while Dataset B involves exploratory repository searches with multiple filters",
        "Dataset A tasks center on feature capabilities understanding, while Dataset B tasks center on repository metadata analysis",
        "Dataset A contains subscription management tasks, while Dataset B contains contributor/release history investigation tasks"
      ]
    },
    "espn": {
      "nnetnav_live_site=espn_num_tasks=62_portion=0": [
        "Dataset A tasks focus on retrieving real-time/in-progress game updates while Dataset B emphasizes upcoming/pre-game information",
        "Dataset A requires finding postponed game statuses whereas Dataset B tasks involve analyzing betting spreads/odds",
        "Dataset A includes requests for player-specific injury reports while Dataset B focuses on team transaction tracking",
        "Dataset B tasks require navigation through ranked college team designations (e.g. '9 CONN') not present in Dataset A",
        "Dataset A contains requests for quarter/period-specific play details while Dataset B requires conference standings analysis",
        "Dataset B tasks involve comparing team records with ranking prefixes (e.g. '5 ALA 12-2') absent in Dataset A",
        "Dataset A includes timezone-specific scheduling while Dataset B focuses on network/channel information for broadcasts",
        "Dataset B requires navigation through tournament seeding numbers (e.g. '7 Notre Dame 13-1') not featured in Dataset A",
        "Dataset A tasks involve parsing multi-sport final scores while Dataset B emphasizes head-to-head matchup previews",
        "Dataset B contains requests for player vs player matchup analysis (tennis) not present in Dataset A tasks"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=4": [
        "Tasks in B require conditional comparisons of game statistics (e.g., 'loser high > winner high') not present in A",
        "B includes mathematical calculations (e.g., games played percentage) not required in A's tasks",
        "B requires pattern matching for team names (e.g., 'New' or 'Golden') while A focuses on direct lookups",
        "B tasks demand synthesis of schedule data with conference standings unlike standalone queries in A",
        "B contains specific ESPN+ content exploration tasks absent from A's requirements",
        "B features tighter temporal constraints (e.g., 'within past 2 days') compared to A's broader 2-7 day ranges",
        "B requires game summary generation including top scorers where A only needs score reporting",
        "B tasks involve positional role identification (e.g., 'position on team') not required in A",
        "B includes data consistency checks across players (e.g., matching GP%) absent from A's tasks",
        "B emphasizes recent transaction tracking (trades/acquisitions) while A focuses on general time-sensitive content"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=1": [
        "Dataset B tasks require complex data comparisons (e.g., loser's score > winner's score) not present in Dataset A",
        "Dataset B includes mathematical operations (e.g., percentage calculations) within task requirements",
        "Dataset B tasks demand pattern recognition in team names (e.g., 'New' or 'Golden' in names)",
        "Dataset B requires identification of data consistency across multiple players/teams",
        "Dataset B tasks involve historical career statistics analysis (e.g., LeBron's total games)",
        "Dataset B requires synthesis of brief game summaries from multiple data points",
        "Dataset B tasks necessitate understanding of ESPN+ tool functionalities beyond content access",
        "Dataset B includes explicit conference-specific standing analysis (e.g., Western Conference leaders)",
        "Dataset B tasks require temporal filtering with precise date ranges (e.g., past 2 days)",
        "Dataset B demands identification of positional roles (e.g., player positions) in statistical analysis"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=3": [
        "Dataset B tasks require retrieval of specific player/team transactional updates (e.g., trades, injuries) within defined time windows (e.g., past week)",
        "Dataset B includes tasks involving mathematical calculations/percentages derived from player statistics (e.g., games played percentage)",
        "Dataset B requires identification of teams based on name patterns (e.g., 'New' or 'Golden' in team names)",
        "Dataset B tasks demand explicit identification of broadcast dates alongside game results",
        "Dataset B requires direct comparison of player statistics against team roster data",
        "Dataset B includes queries about ESPN+ service features/content categorization",
        "Dataset B tasks involve counting/classification of league/team quantities from structural menus",
        "Dataset B requires identification of positional rankings (e.g., 'top scorer in Western Conference')",
        "Dataset B tasks specify verification of player roster positions alongside statistical data",
        "Dataset B includes explicit requests for metadata about ESPN's content organization (e.g., number of featured leagues)"
      ],
      "nnetnav_live_site=espn_num_tasks=62_portion=2": [
        "Dataset B tasks require analytical computations (e.g., calculating player GP percentages) absent in Dataset A",
        "Dataset B includes explicit requests for cross-league comparisons (e.g., NHL/NBA team name patterns) not seen in Dataset A",
        "Tasks in B demand temporal specificity (e.g., 'within past 2 days') more frequently than A's general date filtering",
        "B emphasizes ESPN+ platform features (e.g., 'ESPN+ Tools summary') while A focuses on core ESPN content",
        "B requires identification of positional/role-based metrics (e.g., 'top scorer's team position') beyond basic stat retrieval in A",
        "Tasks in B often involve compound queries (e.g., 'list teams with \"New\" + NBA verification') requiring multi-source validation",
        "B contains explicit conference-specific standing requests (e.g., 'Western Conference rebounds leaders') versus A's general standings",
        "Dataset B tasks require content summarization (e.g., 'brief game summary') rather than pure data extraction in A",
        "B includes meta-analysis of website structure (e.g., 'count sports leagues on homepage') absent in A's sport-specific navigation",
        "Dataset B features hypothetical scenario testing (e.g., 'loser high > winner high') requiring comparative game stat analysis"
      ]
    },
    "huggingface": {
      "nnetnav_live_site=huggingface_num_tasks=76_portion=1": [
        "Dataset A tasks focus on technical implementation steps (e.g., model conversion, GPU usage), while B focuses on information retrieval and summarization",
        "Dataset A requires navigation through nested community features (discussions, followers), while B emphasizes quantitative metric extraction (stars, downloads)",
        "Dataset A contains tasks requiring API endpoint configuration/authentication, while B focuses on API consumption without setup details",
        "Dataset A includes cross-modal conversion tasks (paper\u2192HTML), while B focuses on single-modality information extraction",
        "Dataset A tasks involve commercial viability verification (licensing for products), while B emphasizes license type identification",
        "Dataset A requires temporal comparisons of model versions, while B focuses on absolute recency (latest as of date)",
        "Dataset A contains dataset structure analysis tasks (row counts, splits), while B focuses on dataset popularity metrics",
        "Dataset A tasks involve multi-step library integration (Colab setup), while B focuses on documentation lookup",
        "Dataset A includes model architecture analysis tasks, while B emphasizes application scenarios and use cases",
        "Dataset A requires enterprise feature comparisons (security tiers), while B focuses on personal account plans"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=0": [
        "Dataset B tasks require identifying models/datasets with popularity metrics (most downloaded/highest likes)",
        "Dataset B includes content generation tasks using APIs (e.g. story generation)",
        "Dataset B requires format conversion instructions between frameworks (PyTorch\u2194TensorFlow)",
        "Dataset B emphasizes temporal recency constraints (e.g. 'last updated in March 2023')",
        "Dataset B contains summarization requirements for multi-tier pricing/feature comparisons",
        "Dataset B requires direct content inspection of dataset entries (e.g. first message content)",
        "Dataset B includes tracking of research paper updates (daily papers with voting metrics)",
        "Dataset B tasks specify technical implementation parameters (e.g. Trainer API configuration)",
        "Dataset B emphasizes license type comparisons beyond basic retrieval (e.g. CC-BY-SA-4.0)",
        "Dataset B requires explicit reporting of quantitative evaluation metrics"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=4": [
        "Dataset B tasks emphasize summarization of payment plans/features while Dataset A focuses on finding enterprise/pricing information",
        "Dataset B requires identifying models with specific release date ranges (e.g. 'within March 2023') not seen in Dataset A tasks",
        "Dataset B contains tasks about newest platform features (smolagents, LM Studio/Ollama integration) not present in Dataset A",
        "Dataset B includes explicit requirements to report numerical evaluation metrics from model cards unlike Dataset A",
        "Dataset B tasks demand analysis of monetization options (Pro account costs) while Dataset A focuses on technical specs",
        "Dataset B requires content generation through Inference API (e.g. stories) not present in Dataset A's retrieval-focused tasks",
        "Dataset B emphasizes popularity metrics within license categories (e.g. 'most likes for Apache-2.0') more than Dataset A",
        "Dataset B tasks involve newer model types (image-to-video) and modalities not mentioned in Dataset A samples",
        "Dataset B requires direct Dataset Viewer inspection of specific content entries unlike Dataset A's metadata-focused tasks",
        "Dataset B includes documentation lookups for framework conversion steps (PyTorch->TensorFlow) not in Dataset A"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=2": [
        "Dataset B tasks require summarizing payment plan structures while Dataset A focuses on individual pricing tier details",
        "Dataset B includes tasks involving creative content generation through API interactions (e.g., stories) while Dataset A focuses on technical API usage",
        "Dataset B contains explicit requirements for CC-BY-SA-4.0 license verification while Dataset A uses broader license types",
        "Dataset B tasks emphasize identifying 'latest updated' resources with temporal precision while Dataset A uses relative time references",
        "Dataset B requires comparison of educational resource benefits (e.g., Classroom) not present in Dataset A",
        "Dataset B includes explicit 'most downloaded' ranking requirements while Dataset A uses general popularity metrics",
        "Dataset B contains tasks requiring Dataset Viewer inspection while Dataset A focuses on model card analysis",
        "Dataset B includes framework conversion tasks (PyTorch\u2192TensorFlow) not present in Dataset A",
        "Dataset B requires daily research paper monitoring tasks absent from Dataset A",
        "Dataset B tasks involve multilingual dataset exploration (e.g., English/Japanese) while Dataset A focuses on single-language use cases"
      ],
      "nnetnav_live_site=huggingface_num_tasks=76_portion=3": [
        "Dataset B tasks require generating creative content (e.g., stories) through API usage while Dataset A focuses on technical implementation",
        "Dataset B explicitly requests GitHub star counts for documentation libraries unlike Dataset A",
        "Dataset B includes analysis of educational resources (Hugging Face Classroom) absent in Dataset A",
        "Dataset B tasks emphasize social engagement metrics (upvotes) more prominently than Dataset A",
        "Dataset B requires identifying models for niche domains (travel chats, recipe generation) while Dataset A focuses on general NLP tasks",
        "Dataset B contains explicit requirements for creative commons licenses (cc-by-sa-4.0) not emphasized in Dataset A",
        "Dataset B tasks demand direct API interaction for content generation rather than just documentation lookup",
        "Dataset B includes time-bound requirements (e.g., March 2023 updates) not present in Dataset A tasks",
        "Dataset B requires framework conversion steps (PyTorch to TensorFlow) not mentioned in Dataset A",
        "Dataset B tasks involve analyzing community discussion features (daily papers with voting) absent in Dataset A"
      ]
    },
    "coursera": {
      "nnetnav_live_site=coursera_num_tasks=72_portion=3": [
        "Dataset B tasks require structured data extraction of exact course durations and weekly time commitments (e.g. '5 hours/week') rather than general estimates",
        "Dataset B emphasizes verification of instructor credentials through biographical summaries rather than basic credential identification",
        "Tasks in B demand identification of regional academic partnerships (e.g. 'universities from Australia') not required in A",
        "B requires sorting/filtering by specific publication dates (e.g. 'sort by newest') for course discovery",
        "B contains tasks requiring identification of application deadlines for degree programs not present in A",
        "Dataset B tasks specify higher rating thresholds (e.g. 'at least 4.5 stars') compared to general rating checks in A",
        "B requires extraction of specific module-level content details (e.g. 'number of videos in module 2')",
        "Tasks in B demand verification of learner testimonials for Specializations not required in A",
        "B includes requirements to identify complete program structures (e.g. 'list courses in Specialization') rather than single course components",
        "Dataset B tasks focus on degree program discovery (e.g. '3 Bachelor's degrees') rather than individual course credentials"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=2": [
        "Dataset B tasks require identification of specific course modules (e.g., Agile methodology, Measuring Sustainability) within curriculum details",
        "Dataset B queries demand verification of participant testimonials/reviews in course descriptions",
        "Dataset B requires sorting/filtering by course release date or 'newest' status in search results",
        "Dataset B tasks involve identifying regional partner institutions (e.g., Australian university partners)",
        "Dataset B requires extraction of video-level content details within course modules",
        "Dataset B tasks demand explicit comparison of learning outcomes between Specializations",
        "Dataset B includes verification of promotional offers (e.g., $199/year subscription) in course enrollment",
        "Dataset B requires identification of degree program application deadlines from external resources",
        "Dataset B tasks involve differentiating between Specialization types (e.g., Professional Certificate vs. University Specialization)",
        "Dataset B queries require identification of course components specifically marked as 'new AI skills' in titles/descriptions"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=4": [
        "Dataset B tasks emphasize identification of regional affiliations (e.g., Australian universities) in partner institution filtering",
        "Dataset B requires extraction of testimonials or learner feedback for specific programs/certificates",
        "Dataset B tasks involve granular module-level metadata (e.g., video counts per module, specific video titles)",
        "Dataset B queries focus on course/specialization objectives and detailed instructor background beyond basic bios",
        "Dataset B requires identification of application deadlines and credit transfer policies for degree programs",
        "Dataset B tasks demand sorting/filtering by course newness (e.g., 'sort by newest')",
        "Dataset B includes verification of certification authority/issuing body details beyond basic credential types",
        "Dataset B requires extraction of quantitative review metrics (number of reviews) alongside ratings",
        "Dataset B tasks specify exact time commitment requirements (e.g., <20 hours completion time)",
        "Dataset B emphasizes structured learning outcome verification (e.g., 'list courses in specialization')"
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=0": [
        "Dataset B tasks require identifying specific course modules and their content (e.g., video counts, module names), while A focuses on general course attributes.",
        "Dataset B tasks explicitly request instructor biographies and other courses taught by them, whereas A only mentions extracting instructor details generally.",
        "Dataset B includes tasks requiring identification of country-specific institutional partnerships (e.g., Australian universities), while A focuses on general institutional affiliations.",
        "Dataset B tasks demand strict rating thresholds (e.g., '4+ stars') as filters, while A mentions prioritizing high ratings without explicit thresholds.",
        "Dataset B requires verification of course testimonials/reviews, which is not present in A's task characteristics.",
        "Dataset B tasks specify sorting mechanisms (e.g., 'sort by newest'), while A focuses on filtering without explicit sorting requirements.",
        "Dataset B includes queries about application deadlines for degree programs, which A's tasks do not mention.",
        "Dataset B tasks request detailed learning outcomes for specializations, while A focuses on comparing program types without outcome specifics.",
        "Dataset B requires identification of exact video/content names within course modules, whereas A lacks structural content details.",
        "Dataset B tasks explicitly ask for review counts ('number of reviews received'), while A only prioritizes reviews without quantification."
      ],
      "nnetnav_live_site=coursera_num_tasks=72_portion=1": [
        "Dataset B tasks require identifying specific regional partner institutions (e.g., Australian universities/companies) while A focuses on global partners",
        "Tasks in B emphasize extracting detailed instructor biographies and academic/professional backgrounds compared to A's focus on general credentials",
        "B requires locating exact module video titles and counts within courses, while A focuses on broader curriculum component identification",
        "Tasks in B demand identification of sustainability-focused course modules (e.g., 'Measuring Sustainability') not specified in A",
        "B includes requirements to locate program testimonials/reviews within Specialization pages, absent from A's tasks",
        "Dataset B tasks require filtering for country-specific degree application deadlines, unlike A's general admission requirements",
        "B emphasizes identifying newest courses through explicit sorting by release date/recently added filters not mentioned in A",
        "Tasks in B specify verifying course completion badges/digital credentials format, while A focuses on certification availability",
        "B requires identification of 3D printing/IoT hardware implementation components in course descriptions not present in A's tech focus",
        "Dataset B tasks demand verification of exact weekly time commitment calculations based on variable study hours, while A uses fixed duration comparisons"
      ]
    },
    "arxiv": {
      "nnetnav_live_site=arxiv_num_tasks=80_portion=1": [
        "Dataset B tasks require summarization of paper objectives/hypotheses not present in Dataset A",
        "Dataset B includes tasks requesting quantitative results (e.g. publication counts) absent in Dataset A",
        "Dataset B contains tasks about non-English submission requirements for abstracts unlike Dataset A",
        "Dataset B requires identification of author affiliations while Dataset A focuses on author names only",
        "Dataset B tasks specify tighter date constraints (e.g. 'last two days') more frequently than Dataset A",
        "Dataset B includes comparative analysis tasks between search terms/phrases not seen in Dataset A",
        "Dataset B contains requests for organizational leadership information (e.g. arXiv leadership team)",
        "Dataset B tasks require analysis of technical content depth (e.g. formula counting) beyond Dataset A",
        "Dataset B includes category-specific publication frequency analysis requirements",
        "Dataset B tasks emphasize recent submission tracking more urgently than Dataset A's date ranges"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=4": [
        "Dataset B tasks require multi-step data aggregation (e.g., counting publications per category/timeframe) while Dataset A focuses on single-resource retrieval",
        "Dataset B includes administrative/operational tasks (e.g., leadership team identification, subscription management) absent in Dataset A",
        "Dataset B tasks demand detailed content analysis (e.g., formula identification, hypothesis extraction) beyond metadata retrieval in Dataset A",
        "Dataset B requires temporal precision constraints (e.g., 'within last two days', 'this week') more rigorously than Dataset A's general date ranges",
        "Dataset B contains explicit cross-category comparison tasks (e.g., search in specific category vs all archives) not present in Dataset A",
        "Dataset B tasks frequently require authorship verification through affiliation tracking while Dataset A focuses on basic author name matching",
        "Dataset B includes multilingual content handling requirements (e.g., non-English abstract guidelines) absent in Dataset A tasks",
        "Dataset B tasks involve citation pattern analysis (e.g., journal reference formatting) beyond basic citation extraction in Dataset A",
        "Dataset B requires quantitative analysis of search results (e.g., result counts, temporal distributions) unlike Dataset A's qualitative searches",
        "Dataset B tasks demand technical content interpretation (e.g., loss function identification) rather than just format handling in Dataset A"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=0": [
        "Dataset B tasks require summarization of paper objectives/hypotheses while Dataset A focuses on locating specific paper sections",
        "Dataset B includes explicit requests for publication count statistics within categories/timeframes not present in Dataset A",
        "Dataset B tasks require identification of author affiliations as primary objectives rather than secondary metadata extraction",
        "Dataset B contains queries about specific conference references (e.g. CVPR) not found in Dataset A tasks",
        "Dataset B includes requests for institutional leadership information (e.g. arXiv Leadership Team) absent from Dataset A",
        "Dataset B tasks explicitly probe multi-language abstract formatting requirements while Dataset A only references general policy location",
        "Dataset B requires analysis of paper content elements (e.g. formula counting) beyond metadata extraction in Dataset A",
        "Dataset B contains subscription management tasks (e.g. email alerts) through help documentation vs general navigation in Dataset A",
        "Dataset B tasks use more specific temporal constraints (e.g. 'within last two days') compared to Dataset A's relative date ranges",
        "Dataset B includes journal reference field searches while Dataset A focuses on title/abstract/keyword searches"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=2": [
        "Dataset B tasks require handling multi-language abstract requirements and non-English submissions",
        "Dataset B includes queries about arXiv's leadership team and organizational structure",
        "Dataset B tasks demand counting specific formula occurrences within papers",
        "Dataset B requires interpretation of conference-specific references (e.g. CVPR, ACL Workshop)",
        "Dataset B contains tasks needing comparison of search results across different query formulations",
        "Dataset B involves identifying loss functions and technical implementation details in methodologies",
        "Dataset B tasks require tracking submission dates relative to specific conference timelines",
        "Dataset B includes operational queries about email subscription management for category updates",
        "Dataset B tasks demand verification of author affiliations through metadata analysis",
        "Dataset B contains requirements to cross-reference submission dates with external event timelines"
      ],
      "nnetnav_live_site=arxiv_num_tasks=80_portion=3": [
        "Tasks in B require aggregating result counts or statistical metrics (e.g., 'how many formulas', 'how many results') while A focuses on direct retrieval without quantification.",
        "B includes tasks demanding summarization of paper objectives/hypotheses, whereas A tasks only extract predefined metadata (titles, authors, etc.).",
        "B requires identifying author affiliations explicitly, while A handles author variations without affiliation extraction.",
        "B involves queries about non-English submission guidelines (e.g., multi-language abstracts), which A does not address.",
        "B tasks demand navigation of submission policies (e.g., email subscriptions, leadership team info) beyond content retrieval in A.",
        "B requires comparing search term variations (e.g., 'CVPR 2023' vs. 'CVPR2023') to assess syntax sensitivity, unlike A's straightforward keyword usage.",
        "B tasks involve content interpretation (e.g., locating loss functions, identifying paper sections) rather than A\u2019s metadata extraction.",
        "B emphasizes stricter temporal constraints (e.g., 'within the last two days', 'this week') compared to A\u2019s general date ranges.",
        "B tasks require cross-archive comparison (e.g., category-specific vs. all archives) as explicit objectives, while A assumes single-archive filtering.",
        "B includes validation of publication status (e.g., 'originally announced in October 2023') as part of queries, whereas A focuses on current status."
      ]
    },
    "bbc": {
      "nnetnav_live_site=bbc_num_tasks=69_portion=2": [
        "Dataset B tasks require synthesizing information across multiple articles to create summaries, while Dataset A focuses on locating single pieces of information",
        "Dataset B includes explicit requirements to analyze multimedia elements (e.g. 'What is in the first picture'), while Dataset A focuses primarily on text content",
        "Dataset B tasks demand comparison of data across categories (e.g. country rankings in sports), whereas Dataset A tasks focus on retrieval without comparison",
        "Dataset B requires identification of content authorship and publication dates as part of task completion, unlike Dataset A",
        "Dataset B tasks involve evaluating economic/political implications of news events, while Dataset A focuses on factual retrieval",
        "Dataset B includes specific requests for statistical analysis (e.g. 'count which country has the most players'), absent in Dataset A",
        "Dataset B tasks require navigation through specialized content series (e.g. 'The SpeciaList'), while Dataset A uses more generic categories",
        "Dataset B contains tasks requiring synthesis of technical/scientific explanations (e.g. climate change causes), while Dataset A focuses on event reporting",
        "Dataset B includes cross-domain tasks combining sections (e.g. Weather and News), whereas Dataset A tasks stay within single domains",
        "Dataset B tasks demand real-time interpretation of dynamic content (e.g. sports leaderboards), while Dataset A focuses on static information retrieval"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=3": [
        "Tasks in dataset B require explicit summarization of article content (e.g. 'summarize key points'), while A focuses on information retrieval without synthesis",
        "Dataset B contains tasks requiring identification of authorship/publication dates, while A tasks lack metadata verification requirements",
        "B includes analytical tasks assessing regional impact (e.g. 'economic implications'), whereas A focuses on basic regional content location",
        "Tasks in B demand quantitative analysis of structured data (e.g. 'count which country has most players'), absent from A's requirements",
        "B contains explicit requests for multimedia content analysis (e.g. 'what is in the first picture'), while A only requires finding multimedia",
        "Dataset B tasks require comparative analysis across content categories (e.g. 'topics most news are about'), unlike A's single-topic focus",
        "B includes historical reference tasks (e.g. 'Notable deaths 2024'), while A focuses exclusively on current/recent information",
        "Tasks in B require navigation through multiple hierarchical levels (e.g. 'World > Africa > specific article'), whereas A uses flatter navigation",
        "Dataset B contains explicit verification tasks (e.g. 'find and confirm author'), while A focuses on simple information retrieval",
        "B includes tasks requiring cross-referencing between content types (e.g. 'economic implications from business news'), absent in A"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=1": [
        "Dataset B tasks require identifying article authors and publication dates, while Dataset A does not mention authorship details",
        "Dataset B includes tasks requiring analysis of economic implications, while Dataset A focuses on factual retrieval without interpretation",
        "Dataset B tasks specifically request multimedia content analysis (e.g. describing images), while Dataset A only requires general multimedia awareness",
        "Dataset B contains tasks requiring comparison of numerical data across categories (e.g. sports statistics), absent in Dataset A",
        "Dataset B tasks demand hierarchical navigation through specialized subsections (e.g. 'The SpeciaList'), while Dataset A uses broader categories",
        "Dataset B includes time-bound verification tasks (e.g. 'most recent tournament'), whereas Dataset A focuses on general recency",
        "Dataset B requires cross-referencing multiple content types (articles + podcasts), while Dataset A tasks stay within single content formats",
        "Dataset B tasks involve country-specific data aggregation (e.g. player nationalities), absent in Dataset A's regional focus",
        "Dataset B contains explicit requests for metadata analysis (e.g. publication date), while Dataset A only uses timestamps for recency",
        "Dataset B tasks require causal reasoning (e.g. impacts of tech layoffs), while Dataset A focuses on direct information retrieval"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=0": [
        "Dataset B tasks require summarization and synthesis of key points from articles, while Dataset A focuses on locating specific information without synthesis",
        "Dataset B includes tasks involving cross-sectional comparisons (e.g., regional leaderboards in sports), while Dataset A tasks remain single-section focused",
        "Dataset B requires explicit identification of multimedia content characteristics (e.g. 'what is in the first picture'), whereas Dataset A only requires basic multimedia identification",
        "Dataset B tasks frequently involve temporal analysis of content recency ('most recent report', 'latest development'), while Dataset A uses timestamps primarily for chronology",
        "Dataset B contains tasks requiring evaluation of content distribution patterns ('what topics most news are about'), absent in Dataset A's information retrieval focus",
        "Dataset B includes explicit geographic granularity requirements (country-level analysis in sports/golf), while Dataset A's geographic tasks remain regional/categorical",
        "Dataset B tasks require multi-step correlation of data points (e.g. player nationality + scores), whereas Dataset A tasks involve single-data-point extraction",
        "Dataset B features tasks requiring quantitative analysis (counting players/countries in leaderboards), unlike Dataset A's qualitative information retrieval",
        "Dataset B includes explicit author identification requirements, while Dataset A tasks never request authorship details",
        "Dataset B tasks involve comparative economic impact analysis ('stated impacts on European economies'), whereas Dataset A's economic tasks focus on basic situation reporting"
      ],
      "nnetnav_live_site=bbc_num_tasks=69_portion=4": [
        "Tasks in B require summarizing key points from articles more explicitly than A",
        "B includes tasks demanding extraction of numerical data (e.g. counts, rankings) from content more frequently than A",
        "B emphasizes identifying specific geographic locations (cities, regions) within articles more than A",
        "Tasks in B more frequently require matching visual elements (e.g. first picture) to article content than A",
        "B contains more tasks requiring temporal comparisons (e.g. most recent vs yesterday) in sports/events coverage",
        "B includes explicit requirements to identify author names and publication dates in metadata extraction",
        "Tasks in B more frequently demand cross-referencing between multiple content formats (text/images/data)",
        "B contains more direct requests for comparative analysis (e.g. country comparisons in leaderboards)",
        "Tasks in B require more precise identification of hierarchical category paths (e.g. Sport\u2192Football\u2192Tournament)",
        "B includes more tasks requiring verification through related content sections (e.g. 'OTHER TOP STORIES')"
      ]
    },
    "amazon": {
      "nnetnav_live_site=amazon_num_tasks=63_portion=2": [
        "Dataset B tasks require multi-attribute filtering combinations (e.g., price + technical specs + rating thresholds) while A uses single-attribute filters",
        "B contains tasks requiring temporal constraints (e.g., 2024 publication dates) not present in A's requirements",
        "B includes explicit product dimension specifications (e.g., 30\" length, 300 sq ft coverage) absent from A's tasks",
        "B tasks demand quantitative performance metrics verification (e.g., 10-hour battery life) unlike A's qualitative requirements",
        "B requires explicit sorting instruction execution (e.g., 'high to low') while A only references basic price sorting needs",
        "B contains tasks requiring inventory/availability cross-verification (e.g., specific color/size combinations) beyond A's basic checks",
        "B includes conditional logic requirements (e.g., 'if free return available') not present in A's linear tasks",
        "B tasks specify exact review quantity thresholds (e.g., 50+ reviews) vs A's general rating requirements",
        "B requires preservation/recall of intermediate results (e.g., 'save lowest priced') unlike A's single-action tasks",
        "B contains explicit spatial/measurement constraints (e.g., 2-3 quart capacity) while A uses generic size categories"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=3": [
        "Dataset B tasks require multi-attribute filtering combinations (price + technical specs + features)",
        "Dataset B tasks frequently specify exact numerical thresholds (e.g. '300 sq ft', '10-hour battery') not present in A",
        "Dataset B includes post-purchase verification tasks (return policies, delivery verification)",
        "Dataset B tasks require identification of real-time dynamic content (current deals/percentages)",
        "Dataset B emphasizes spatial/measurement constraints (room size capacity, product dimensions)",
        "Dataset B tasks require comparison of ranked results (top 3 results, first 3 after sorting)",
        "Dataset B includes an explicit requirement to parse/save search result positions",
        "Dataset B tasks demand cross-referencing multiple quality metrics (ratings + review counts + price)",
        "Dataset B contains tasks requiring certification verification (energy efficiency ratings)",
        "Dataset B requires temporal awareness of product versions (2024 publications, latest releases)"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=1": [
        "Tasks in dataset B require handling specific product specifications (e.g., 'water-resistant design', '300 sq ft room size') not explicitly mentioned in dataset A",
        "Dataset B tasks involve precise numeric ranges (e.g., 'under $50', '2-3 quarts capacity') as mandatory filters rather than general price categories",
        "Tasks in B require validation of technical attributes (e.g., 'energy efficiency rating', 'RFID blocking') beyond basic product features",
        "Dataset B includes time-sensitive deal verification requirements (e.g., 'current offers with percentage discounts') not present in A",
        "Tasks in B demand explicit return policy checks and free return availability verification",
        "Dataset B requires handling publication date constraints (e.g., 'published in 2024') for media products",
        "B tasks involve multi-criteria sorting (e.g., 'price high to low' with specific result count requirements)",
        "Dataset B includes size-specific filtering with exact measurements (e.g., '30 inches length', 'size 6')",
        "Tasks in B require validation of review metrics (e.g., 'minimum 50 customer reviews') as hard constraints",
        "Dataset B contains tasks requiring preservation/recall of specific search results positions (e.g., 'first 3 results after sorting')"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=0": [
        "Tasks in dataset B require multi-criteria filtering (price + specific attributes) while A uses single criteria",
        "Dataset B tasks demand precise price range specifications (e.g., $40-$60) vs A's broader ranges (e.g., under $50)",
        "B requires technical specifications (e.g., battery life, room size) while A focuses on general product categories",
        "Tasks in B explicitly require comparison of ranked results (top 3) vs A's general sorting requirements",
        "B includes physical dimension requirements (e.g., 30\" length) absent in A's tasks",
        "Dataset B tasks specify exact review count thresholds (500+ reviews) vs A's general rating requirements",
        "B requires verification of specific technical features (RFID blocking, digital display) while A validates basic attributes",
        "Tasks in B involve procedural checks (return policies) not present in A's requirements",
        "Dataset B includes temporal constraints (2024 publications) while A focuses on general availability",
        "B tasks require color/size combination validation in product searches unlike A's single attribute checks"
      ],
      "nnetnav_live_site=amazon_num_tasks=63_portion=4": [
        "Dataset B tasks require multi-criteria filtering (e.g., price range + feature + rating) in a single query, while A focuses on single-criteria filtering.",
        "B tasks explicitly demand numerical value verification (e.g., '300 sq ft', '10 hours battery life'), whereas A uses qualitative ranges (e.g., 'cheapest', 'luxury').",
        "B includes time-sensitive price comparisons (e.g., 'compare top three results'), while A focuses on static price checks.",
        "B requires validation of specific technical specifications (e.g., 'energy efficiency rating', 'RFID blocking'), while A emphasizes general attribute verification.",
        "B tasks involve conditional availability checks (e.g., 'FREE delivery', 'anti-squirrel mechanism'), whereas A focuses on basic stock status.",
        "B mandates verification of quantitative customer feedback metrics (e.g., '50+ reviews'), while A prioritizes star ratings alone.",
        "B includes spatial/measurement constraints (e.g., '30 inches length', '2-3 quarts capacity'), which are absent in A's tasks.",
        "B requires explicit sorting/filtering actions (e.g., 'sort high to low'), while A implies default sorting behaviors.",
        "B tasks demand product version/year specificity (e.g., 'published in 2024'), unlike A's generic temporal references (e.g., 'Winter Sale').",
        "B incorporates procedural validation steps (e.g., 'how to return'), while A focuses on transactional actions (e.g., 'add to cart')."
      ]
    },
    "wolframalpha": {
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=4": [
        "Dataset B tasks require precise parameter specifications (e.g., 'spring equilibrium length=0.12m') while Dataset A uses generic problem descriptions.",
        "Dataset B includes multi-component queries requiring combined outputs (e.g., 'mass of Jupiter compared to Earth and day length') whereas Dataset A focuses on single-aspect requests.",
        "Dataset B contains explicit material science requests with standardized codes (e.g., 'UNS A92024') unlike Dataset A's generic material inquiries.",
        "Dataset B tasks demand parametric equation manipulation (e.g., 'rotated 33 degrees counterclockwise') while Dataset A focuses on basic function visualization.",
        "Dataset B requires specialized physics computations with real-world engineering contexts (e.g., hydroelectric power plant outputs) beyond Dataset A's theoretical physics questions.",
        "Dataset B includes precise temporal constraints (e.g., 'geomagnetic field on June 20, 2023') whereas Dataset A uses non-specific temporal references.",
        "Dataset B tasks involve complex chemical compound analysis with percentage composition calculations absent in Dataset A's basic molecular property requests.",
        "Dataset B contains Unicode character identification within specific numeric ranges, a technical computing aspect not present in Dataset A.",
        "Dataset B requires comparative nutritional analysis with assumed parameters (e.g., '300g serving size') while Dataset A requests singular metric calculations.",
        "Dataset B features advanced differential equation solutions with physical system modeling (e.g., spring pendulum dynamics) beyond Dataset A's standard equation solving."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=0": [
        "Dataset B tasks emphasize engineering and applied physics problems, whereas A includes humanities topics like linguistics and history.",
        "Tasks in B frequently require precise numerical parameters (e.g., specific lengths, angles, material codes), while A uses broader or generic values.",
        "B tasks often involve multi-step calculations within a single query (e.g., conversion followed by compositional analysis), unlike A's simpler stepwise requests.",
        "B includes tasks focused on material properties (e.g., resistivity of alloys) and physical measurements, which are absent in A.",
        "Tasks in B involve parametric equations, geometric transformations (e.g., rotated ellipses), and complex curve plotting, whereas A focuses on basic function derivatives or plots.",
        "B tasks demand solving higher-level differential equations (e.g., second-order ODEs) and integrals with explicit bounds, while A covers foundational algebraic equations.",
        "B tasks specify exact real-world conditions (e.g., dates, temperatures, geographic locations) for context-specific computations, unlike A's general scenarios.",
        "B includes niche technical tasks like identifying Unicode symbols within strict numerical ranges, whereas A\u2019s Unicode queries are definitional or conceptual.",
        "B tasks prioritize real-world engineering applications (e.g., hydroelectric energy output, spring pendulum dynamics), while A includes generic financial or health metrics.",
        "Tasks in B require outputs with strict unit specifications (e.g., percentage composition, resistivity units), while A often omits explicit unit constraints."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=1": [
        "Tasks in dataset B require multi-step calculations integrating multiple parameters (e.g., mass, spring constants, angles), while dataset A focuses on single-step computations or data retrieval.",
        "Dataset B tasks frequently involve exact numerical values (e.g., 15 kilograms, 20 degrees Celsius, 72 ppi), whereas dataset A tasks use more general or symbolic terms (e.g., \"last 10 years,\" \"ideal body weight\").",
        "Dataset B includes queries about specific real-world entities (e.g., Itaipu Dam, Jupiter, UNS material codes), while dataset A tasks reference generic categories (e.g., \"Mars,\" \"Hydrogen\").",
        "Tasks in dataset B demand outputs with explicit component breakdowns (e.g., percentage composition by weight, parametric equation plots), whereas dataset A emphasizes summary results (e.g., nutrition facts, step-by-step solutions).",
        "Dataset B tasks compare narrowly defined named entities (e.g., Whopper vs. Baconator vs. Big Mac), while dataset A comparisons involve broad categories (e.g., uranium half-life vs. sun lifetime).",
        "Dataset B requires geometric transformations (e.g., rotated ellipses) and parametric equations (e.g., Einstein curve), whereas dataset A focuses on basic function plotting (e.g., z = x\u00b2 + y\u00b2).",
        "Tasks in dataset B involve material-specific physical properties under constrained conditions (e.g., resistivity at 20\u00b0C), while dataset A queries general properties (e.g., molecular structure).",
        "Dataset B includes constraints on mathematical regions (e.g., inner pentagram inequalities), which are absent in dataset A tasks.",
        "Dataset B tasks combine multiple domains in a single query (e.g., physics + health for sunburn time), while dataset A tasks remain domain-specific (e.g., pure chemistry or finance).",
        "Dataset B requires precise Unicode character identification within numerical ranges (e.g., 8900\u20138920), whereas dataset A involves broader linguistic exploration (e.g., word etymology)."
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=3": [
        "Dataset B tasks require solving parametric equations or plotting parametric curves (e.g., Einstein curve equations)",
        "Dataset B queries involve multi-variable physics/engineering scenarios (e.g., spring pendulum with multiple initial conditions)",
        "Dataset B tasks require working with material science specifications (e.g., UNS alloy designations)",
        "Dataset B includes Unicode character analysis within specific numeric ranges",
        "Dataset B tasks demand geometric transformation operations (e.g., rotated coordinate systems)",
        "Dataset B requires precise chemical stoichiometry calculations (e.g., percentage composition by element weight)",
        "Dataset B contains queries about real-time geophysical data (e.g., geomagnetic field on specific dates)",
        "Dataset B tasks involve comparative nutritional analysis with assumed parameters (e.g., standardized food portions)",
        "Dataset B includes applied thermodynamics calculations (e.g., sun exposure time based on SPF and skin type)",
        "Dataset B requires manipulation of polynomial expressions for simplification goals"
      ],
      "nnetnav_live_site=wolframalpha_num_tasks=66_portion=2": [
        "Tasks in B require multi-step compositional analysis (e.g., unit conversion followed by percentage breakdown)",
        "Tasks in B involve dynamic system modeling with initial physical conditions (e.g., spring pendulum parameters)",
        "Tasks in B combine multiple comparative metrics (e.g., planetary mass + rotational period) in single queries",
        "Tasks in B require parametric equation visualization (e.g., specialized curve plotting)",
        "Tasks in B demand algebraic expression simplification for optimization",
        "Tasks in B require precise temporal/spatial data retrieval (e.g., geomagnetic field at specific date/location)",
        "Tasks in B integrate numerical results with structural visualization constraints (e.g., rotated coordinate systems)",
        "Tasks in B involve Unicode character pattern matching within specified numeric ranges",
        "Tasks in B necessitate multi-source nutritional data aggregation for food comparisons",
        "Tasks in B require geometric constraint analysis for complex shapes (e.g., inner regions of polygons)"
      ]
    },
    "allrecipes": {
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=0": [
        "Tasks in B require explicit listing of nutrition facts (e.g., carbohydrate content, calories) in responses, while A does not.",
        "All B tasks specify numerical thresholds for reviews/ratings (e.g., 'at least 50 reviews'), whereas A includes tasks without explicit quantity requirements.",
        "B tasks demand structured outputs with multiple components (e.g., ingredients + steps + nutrition), while A tasks focus on single-goal retrieval.",
        "Every B task requires identifying specific quantitative metrics (e.g., '4.5 stars', 'under 30 minutes'), whereas A allows qualitative criteria like 'kid-friendly' without numerical bounds.",
        "B tasks explicitly require outputting preparation/cooking times in responses, which is absent in A's task requirements.",
        "All B samples include direct instructions to extract recipe metadata (e.g., 'primary seasoning used'), while A tasks focus on general discovery without metadata extraction.",
        "B tasks systematically request serving size information, whereas A tasks never mention serving quantities.",
        "100% of B tasks require validation against star ratings (e.g., '4.5 or higher'), while A includes tasks without rating constraints.",
        "Storage instructions are explicitly required in B tasks when applicable, a requirement absent in A's tasks.",
        "B tasks universally specify output formatting requirements (e.g., 'list 3 recipes', 'include nutritional information'), while A tasks lack formatting directives."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=4": [
        "Tasks in B explicitly require providing specific nutritional values (e.g., carbs, calories) rather than general metadata",
        "B tasks specify exact numerical thresholds for review counts (e.g., 'at least 50 reviews') rather than general popularity",
        "B requires outputting structured recipe components (ingredient lists, preparation steps) as mandatory elements",
        "B includes food storage/preservation instructions as part of task requirements",
        "B tasks demand identification of specific recipe attributes (main seasoning, vegetable varieties) in responses",
        "B contains requests for multiple recipe recommendations within single tasks (e.g., 'list 3 recipes')",
        "B uses precise time constraints (e.g., 'under 1 hour') rather than general time indicators",
        "B specifies exact serving size requirements (e.g., 'suitable for 6 people') in task parameters",
        "B includes historical/collection-based recipe searches (e.g., '1960s recipes') as distinct task types",
        "B requires brief preparation step summaries rather than just locating/saving recipes"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=1": [
        "Dataset B tasks require structured data extraction (e.g., ingredient lists, cooking steps) while A focuses on exploratory navigation (e.g., browsing categories).",
        "Tasks in B mandate explicit output formatting (e.g., 'Provide the nutritional information per serving'), unlike A's open-ended exploration.",
        "B specifies exact numerical thresholds for recipe metadata (e.g., 'under 30 minutes', 'over 500 reviews') in all queries, while A uses broader thresholds (e.g., '50+ reviews').",
        "All B tasks require reporting quantitative nutrition metrics (e.g., 'total carbohydrate content'), whereas A only references macros generally.",
        "B systematically requests preparation/cook time parameters as standalone outputs, while A treats them as filtering criteria.",
        "Tasks in B explicitly demand ingredient enumeration in responses, which A only implies through filtering requirements.",
        "Dataset B includes storage/leftover instructions in task requirements (e.g., 'how to store these rolls'), absent in A's tasks.",
        "All B tasks require validation of recipe scale (e.g., 'suitable for 6 people'), while A focuses on personal saving/bookmarking actions.",
        "B tasks specify output organization (e.g., 'list 3 recommended recipes'), whereas A tasks involve open-ended comparison without structured reporting.",
        "Dataset B requires explicit citation of recipe provenance (e.g., 'second recipe in this collection'), while A tasks focus on general discovery."
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=3": [
        "Tasks in B require specific quantitative criteria (e.g., 'at least 50 reviews', '4 stars or higher') while A focuses on general categorical requirements",
        "B includes explicit requests for nutritional data output (e.g., 'show Nutrition Facts') whereas A only references nutritional information as search criteria",
        "B tasks demand multi-component responses (ingredient lists + cooking times + steps) while A focuses on single-action objectives like finding/saving recipes",
        "B requires parsing and reporting numerical metrics (review counts, star ratings) as part of task completion unlike A's qualitative usage of reviews",
        "B contains complex filtering combinations (e.g., 'vegetarian + under 30 mins + specific ingredients') versus A's simpler single-filter requests",
        "B tasks specify output formatting requirements (e.g., 'list 3 recipes', 'note the vegetables') while A focuses on basic information retrieval",
        "B includes preservation/storage instructions as task components ('how to store these rolls') not present in A's requirements",
        "B emphasizes exact timing constraints ('prep time under 1 hour') versus A's general temporal references like 'weekly meal prep'",
        "B requires explicit identification of recipe metadata (cook time, servings) rather than implicit usage seen in A's tasks",
        "B features precise dietary specifications (vegan, high-protein) while A uses broader categories (low-carb, keto)"
      ],
      "nnetnav_live_site=allrecipes_num_tasks=79_portion=2": [
        "Tasks in dataset B specify exact numerical thresholds for review counts (e.g., 50+ reviews) while A uses general minimum thresholds",
        "Dataset B requires explicit inclusion of nutritional facts per serving (e.g., carb content, calories) in task outputs",
        "Tasks in B frequently demand 4.5-star rating thresholds compared to A's 4-star baseline",
        "Dataset B tasks include strict time constraints (e.g., 'under 30 minutes') more systematically than A",
        "B's tasks require outputting full ingredient lists and preparation steps as mandatory components",
        "Dataset B emphasizes specific dietary labels (e.g., vegan, low-carb) in search criteria more consistently",
        "Tasks in B combine 3+ attributes simultaneously (ratings+reviews+nutrition+time) more frequently than A",
        "Dataset B specifies exact serving quantities (e.g., 'suitable for 6 people') as a required output element",
        "B's tasks require identification of specific vegetable varieties/ingredients (e.g., zucchini) within recipe parameters",
        "Dataset B includes multi-recipe comparison tasks (e.g., 'list 3 recommended dinners') absent in A's samples"
      ]
    },
    "dictionary.cambridge": {
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=2": [
        "Dataset B tasks require changing the website's interface language (e.g., switching to Deutsch), while Dataset A focuses on translating words between languages.",
        "Dataset B includes tasks requiring users to generate or provide multiple example sentences illustrating different contextual uses of words.",
        "Dataset B tasks involve applying grammatical rules (e.g., converting direct to indirect speech), whereas Dataset A focuses on locating existing grammatical explanations.",
        "Dataset B's Word Scramble tasks explicitly require timed interactions (e.g., 'beat the clock'), while Dataset A tasks only involve accessing the game.",
        "Dataset B tasks use specialized or advanced vocabulary (e.g., 'cryptocurrency,' 'zeitgeist'), whereas Dataset A uses more common terms (e.g., 'hello,' 'jukebox').",
        "Dataset B tasks require step-by-step application of grammatical structures (e.g., forming comparatives), while Dataset A focuses on identifying definitions or rules.",
        "Dataset B tasks often demand two example sentences per word to demonstrate varied contexts, whereas Dataset A typically asks for a single example.",
        "Dataset B emphasizes procedural navigation (e.g., 'how to change direct speech to indirect'), while Dataset A focuses on declarative knowledge retrieval.",
        "Dataset B tasks include explicit instructions to interact with dynamic elements (e.g., timed games), whereas Dataset A tasks involve static content interaction.",
        "Dataset B requires explicit comparison of grammatical usage with countable/uncountable nouns, while Dataset A tasks compare dictionary variants (e.g., learner\u2019s vs. essential)."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=3": [
        "Tasks in dataset B require providing example sentences illustrating word usage in multiple contexts, while dataset A typically requests single examples.",
        "Dataset B tasks involve translating abstract or specialized terms (e.g., 'nostalgia' to Chinese), whereas dataset A focuses on concrete vocabulary translations (e.g., days of week).",
        "Grammar tasks in dataset B demand explicit rule explanations with transformation examples (e.g., direct\u2192indirect speech), while dataset A focuses on identifying grammatical categories.",
        "Dataset B consistently requires both UK/US pronunciation reporting, while dataset A sometimes asks for single variants.",
        "Tasks in dataset B specify interactive game challenges with step completion requirements (e.g., 'try first example'), unlike dataset A's general game mentions.",
        "Dataset B emphasizes morphological analysis through tasks like word formation patterns (e.g., '-washing' in 'healthwashing'), absent in dataset A.",
        "Translation tasks in dataset B involve bidirectional language pairs beyond English-European languages featured in dataset A.",
        "Dataset B includes tasks requiring explicit contextual differentiation (e.g., 'use in two different contexts'), while dataset A focuses on basic usage examples.",
        "Tasks in dataset B target advanced vocabulary acquisition (e.g., 'quintessential', 'zeitgeist'), contrasting with dataset A's emphasis on common words.",
        "Dataset B requires detailed grammatical feature comparisons (e.g., articles with countable/uncountable nouns), whereas dataset A addresses broader grammar categories."
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=1": [
        "Tasks in B require changing the website's interface language (e.g., to German), while A does not",
        "B includes tasks requiring interaction with 'Popular searches' sections for trending vocabulary terms",
        "B's grammar tasks focus on specific structures (passive voice, indirect speech) rather than general grammar exploration in A",
        "B tasks demand using the Thesaurus for phrase-based synonyms (e.g., 'to behave well'), while A focuses on single-word synonyms",
        "B's Word Scramble tasks require explicit game step execution (e.g., 'try the first example'), unlike A's general exploration",
        "B tasks specify providing example sentences in multiple contextual scenarios, while A typically requires single examples",
        "B's blog-related tasks emphasize language usage nuances (e.g., 'ways to say gradually'), whereas A's involve general language topics",
        "B includes translation tasks involving interface language switching (e.g., English to Deutsch), not just word translation like A",
        "B tasks involve newer/more specialized vocabulary (e.g., 'healthwashing'), while A uses established terms",
        "B requires detailed grammar rule application (e.g., articles with countable/uncountable nouns), while A focuses on basic grammar navigation"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=4": [
        "Dataset B tasks require converting the entire homepage interface language (e.g., to Deutsch), while A tasks only adjust dictionary language direction settings",
        "B tasks explicitly require using words/phrases in multiple example sentences illustrating different contexts, whereas A tasks only request example sentences without contextual variation",
        "B tasks involve detailed grammar guideline navigation (e.g., transforming direct to indirect speech), while A tasks focus on general grammar rule lookups",
        "B tasks specify accessing word games through the '+Plus' section explicitly, while A tasks reference games without section-specific navigation",
        "B tasks consistently require both UK/US pronunciations for single entries, while A tasks may request only one regional variant per task",
        "B tasks include specific grammatical concept exploration (e.g., articles with countable/uncountable nouns), while A tasks address broader grammatical categories",
        "B tasks demand linguistic transformation demonstrations (e.g., direct\u2192indirect speech conversion), while A tasks focus on static rule explanations",
        "B tasks require interface language conversion (whole site localization), while A tasks involve functional language pair adjustments for translations",
        "B tasks emphasize contextual application through varied example sentence generation, while A tasks focus on basic example extraction",
        "B tasks specify navigation through hierarchical grammar category structures, while A tasks reference general grammar guide access"
      ],
      "nnetnav_live_site=dictionary.cambridge_num_tasks=54_portion=0": [
        "Dataset A includes tasks requiring interaction with social media sharing features (e.g., sharing definitions on Facebook/X), while Dataset B does not.",
        "Dataset A tasks involve accessing blog posts (e.g., 'Cheap as chips: talking about low prices'), whereas Dataset B tasks lack blog-related navigation.",
        "Dataset A tasks reference structured vocabulary categories like SMART Vocabulary, absent in Dataset B.",
        "Dataset B tasks explicitly require two example sentences per word for contextual usage, while Dataset A typically requests one.",
        "Dataset B tasks focus on specific grammatical structures (e.g., passive voice, articles), whereas Dataset A tasks address broader grammatical concepts (e.g., adjectives, adverbs).",
        "Dataset B tasks require switching the website's interface language (e.g., English to Deutsch), while Dataset A tasks focus only on dictionary language pairs (e.g., English\u2013French).",
        "Dataset A tasks include references to annual 'Word of the Year' features, absent in Dataset B.",
        "Dataset B tasks in the Plus section specify interacting with the first example of games (e.g., Word Scramble), while Dataset A tasks lack such granularity.",
        "Dataset A includes translation tasks to less common languages (e.g., Portuguese), while Dataset B focuses on widely spoken languages (e.g., Chinese, Spanish).",
        "Dataset B tasks demand detailed grammatical rule explanations (e.g., indirect speech transformations), while Dataset A tasks emphasize general rule comprehension."
      ]
    },
    "apple": {
      "nnetnav_live_site=apple_num_tasks=70_portion=1": [
        "Tasks in B require identifying exact numerical specifications (e.g., video recording resolution, product dimensions)",
        "B includes tasks targeting color availability comparisons across multiple product generations",
        "B contains explicit requests for release date timelines and regional availability details",
        "Tasks in B specify precise configuration requirements (e.g., '256GB storage in Purple color')",
        "B requires identification of incremental hardware updates across sequential product versions",
        "Tasks in B focus on technical measurement comparisons (e.g., weight, screen size differences)",
        "B includes price difference calculations between base models and maximum upgrades",
        "Tasks in B require verification of compatibility with specific iOS versions",
        "B contains requests to identify visual content in service sections (e.g., artist names in Apple Music)",
        "Tasks in B specify exact calendar dates for product availability checks"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=4": [
        "Tasks in B require identifying specific technical measurements (e.g., dimensions, weight, resolution)",
        "B includes explicit requests for software version compatibility checks (e.g., iOS compatibility)",
        "Tasks in B require direct comparisons between 3+ product generations/models simultaneously",
        "B contains queries requiring mathematical calculations of upgrade costs/price differences",
        "Tasks in B specify exact purchase configuration requirements (storage capacity, color availability)",
        "B includes requests for precise release timelines and regional availability dates",
        "Tasks in B focus on accessory technical specifications (e.g., Siri Remote features)",
        "B requires identification of specific product variants within a single model line",
        "Tasks in B demand verification of in-store inventory/pickup scheduling functionality",
        "B includes explicit requests to locate visual content analysis (e.g., identifying artists in images)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=0": [
        "Tasks in B require locating precise technical specifications (e.g., video resolution, weight, dimensions) not emphasized in A",
        "B includes tasks targeting software version compatibility (e.g., iOS 17 with iPhone 12) absent in A",
        "B focuses on quantifying exact upgrade costs (e.g., 'calculate total price difference for MacBook Pro upgrades') while A focuses on general comparisons",
        "B requires identifying specific release timelines and regional availability details more granularly than A",
        "Tasks in B demand direct price calculations between storage/configuration tiers (e.g., 256GB vs. base models) unlike A's simpler price checks",
        "B contains explicit requests for accessory specifications (e.g., Siri Remote features) not present in A's tasks",
        "B includes time-bound actions (e.g., 'schedule in-store pickup for specific dates') absent in A's navigation patterns",
        "Tasks in B require verifying current software/firmware update availability (e.g., Apple Watch updates) not seen in A",
        "B emphasizes exact color variant identification across multiple product generations (e.g., iPhone 13-15 Pro colors) more than A",
        "B contains mathematical comparisons (e.g., 'how many types of AirPods with price difference') requiring quantitative analysis beyond A's qualitative comparisons"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=2": [
        "Dataset B tasks emphasize locating exact technical specifications like video recording resolutions and sensor details",
        "Dataset B includes explicit requests for product color variant comparisons across multiple generations",
        "Dataset B tasks require identifying physical dimensions/weight metrics for hardware components",
        "Dataset B contains specific queries about regional availability and release timelines for unreleased products",
        "Dataset B tasks focus more on cross-generational comparisons (e.g. iPhone 13 Pro vs 14 Pro vs 15 Pro)",
        "Dataset B requires checking compatibility between new software updates and older hardware models",
        "Dataset B includes precise queries about accessory specifications (e.g. Siri Remote features)",
        "Dataset B tasks emphasize base model configurations rather than customization workflows",
        "Dataset B contains explicit requests for condition-specific trade-in value calculations",
        "Dataset B features time-bound availability checks (e.g. specific pickup dates)"
      ],
      "nnetnav_live_site=apple_num_tasks=70_portion=3": [
        "Tasks in B require locating precise technical specifications (e.g., video recording resolution, exact weight measurements) rather than general feature descriptions.",
        "B emphasizes identifying exact regional launch timelines and phased availability details for unreleased products.",
        "Tasks in B involve comparing specific hardware attributes (e.g., screen sizes, remote features) across 3+ product generations rather than 2-generation comparisons.",
        "B requires calculating incremental pricing differences between multiple upgrade tiers (e.g., base vs. max configurations with custom components).",
        "Tasks in B focus on verifying conditional trade-in values (e.g., device condition assessments) rather than general trade-in estimates.",
        "B includes time-bound availability checks (e.g., specific launch dates, in-store pickup scheduling for concrete dates).",
        "Tasks in B demand identification of newly introduced peripheral features (e.g., Siri Remote capabilities) rather than core device functionalities.",
        "B requires cross-referencing software version compatibility with specific legacy devices (e.g., iOS 17 support for iPhone 12).",
        "Tasks in B involve counting distinct product variants (e.g., AirPods types) and calculating their price differentials.",
        "B emphasizes verifying quantitative performance metrics (e.g., battery runtime measurements, resolution statistics) over qualitative feature descriptions."
      ]
    },
    "google_search": {
      "nnetnav_live_site=google_search_num_tasks=72_portion=3": [
        "Tasks in dataset B require extracting precise numerical or alphanumerical data points (e.g., SHA commits, planetary distances, statistical counts) whereas dataset A focuses on general information retrieval (e.g., prices, news summaries).",
        "Dataset B tasks often involve retrieving metadata from developer or technical platforms (e.g., GitHub commits, software compatibility matrices) while dataset A emphasizes consumer platforms (e.g., job boards, recipe sites).",
        "Tasks in dataset B explicitly demand chronological precision (e.g., 'latest commit today', 'last week's airport data') whereas dataset A uses relative timeframes (e.g., 'recent news', 'latest releases').",
        "Dataset B contains astronomy/astrophysics queries requiring dynamic celestial calculations (e.g., current Earth-Mars distance) absent from dataset A's domain coverage.",
        "Tasks in dataset B frequently require aggregation of ranked lists with cardinal constraints (e.g., 'top 3 planets', 'top 5 movies') while dataset A comparisons are open-ended (e.g., 'compare recipes').",
        "Dataset B includes platform-specific community metrics (e.g., Reddit member counts, trending topics) whereas dataset A focuses on institutional sources (e.g., government data, academic papers).",
        "Tasks in dataset B require parsing structured technical specifications (e.g., 'AirDrop web transmission requirements') unlike dataset A's general technical compatibility queries.",
        "Dataset B emphasizes exact event temporal parameters (e.g., 'year before last', 'most recent World Cup') while dataset A uses vague temporal markers (e.g., 'current', 'recent').",
        "Tasks in dataset B target academic/scientific milestones (e.g., Nobel Prize contributions) as discrete retrieval targets, whereas dataset A handles medical/health information as exploratory research.",
        "Dataset B contains explicit cross-platform data synthesis requirements (e.g., comparing IMDb/Rotten Tomatoes ratings) absent from dataset A's single-source comparisons."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=2": [
        "Dataset B tasks require retrieving exact technical specifications (e.g., hardware/software requirements), while A focuses on general consumer information (e.g., recipes, event venues)",
        "B emphasizes extracting precise identifiers (e.g., SHA hashes, version numbers), whereas A prioritizes qualitative data (e.g., health advice, user reviews)",
        "Tasks in B frequently involve interacting with developer platforms (e.g., GitHub repositories), while A focuses on mainstream services (e.g., Google products, Wikipedia)",
        "B includes astronomical/scientific data retrieval (e.g., planetary distances, star systems), absent in A's task scope",
        "Tasks in B demand parsing structured technical documentation (e.g., API requirements), while A emphasizes parsing user-generated content (e.g., recipe blogs)",
        "B contains explicit requests for ranked/ordered lists (e.g., 'top-10', 'top 3'), whereas A's rankings are implicit (e.g., 'best foods')",
        "Tasks in B require knowledge of version control systems (e.g., commit SHAs), while A assumes familiarity with consumer apps (e.g., Duolingo)",
        "B focuses on real-time quantitative data (e.g., 'today's distance', 'latest commit'), while A's temporal requirements are broader (e.g., 'latest news')",
        "Tasks in B often require cross-referencing technical platforms (e.g., FlightAware, developer blogs), while A uses conventional sources (e.g., news outlets, recipe sites)",
        "B emphasizes exact numeric comparisons (e.g., 'how many in Asian'), whereas A uses relative comparisons (e.g., 'compare stock prices')"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=4": [
        "Dataset A tasks focus on exploratory research and trend discovery, while Dataset B requires precise factual retrieval with specific output formats (e.g., lists, exact values)",
        "Dataset B tasks frequently involve numerical quantification requirements (e.g., 'top-10', 'how many', 'total arrivals') absent in Dataset A",
        "Dataset A contains tasks requiring content contribution/editing (Wikipedia edits, recipe database additions) not present in Dataset B",
        "Dataset B emphasizes real-time temporal precision (\"as of today's date\", \"latest commit\") while Dataset A focuses on general recency without strict timestamp requirements",
        "Dataset A includes practical transaction tasks (job applications, ticket purchases) absent in Dataset B's purely informational focus",
        "Dataset B tasks require structured output generation (ranked lists, comparative tables) as explicit requirements rather than implicit analysis",
        "Dataset A contains multi-domain health advisory tasks (medical prevention, nutrition) while Dataset B focuses on technical/statistical queries",
        "Dataset B tasks frequently involve software/technical specification retrieval (hardware requirements, version compatibility) as primary objectives",
        "Dataset A requires interpretation of abstract concepts (AI relevance, fashion trends) while Dataset B focuses on concrete entity identification",
        "Dataset B tasks explicitly require data manipulation/transfer (copy-paste SHA values) not present in Dataset A's observational tasks"
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=0": [
        "Tasks in dataset B require interacting with specialized platforms or repositories (e.g., GitHub, FlightAware, IMDb) to retrieve data, while dataset A focuses on general websites and services.",
        "Dataset B tasks frequently involve retrieving dynamic or real-time data (e.g., distance between celestial bodies as of today, latest commit SHA), whereas dataset A emphasizes static or semi-static information (e.g., prices, venue availability).",
        "Tasks in dataset B often demand parsing structured technical specifications (e.g., hardware/software requirements) or scientific data, unlike dataset A\u2019s focus on consumer-oriented or lifestyle information.",
        "Dataset B includes tasks requiring extraction of ranked or sorted lists (e.g., top 5 movies by ratings, top 3 super-earth planets), while dataset A tasks prioritize singular factual answers.",
        "Tasks in dataset B frequently involve cross-referencing or comparing data across multiple authoritative platforms (e.g., IMDb vs. Rotten Tomatoes), whereas dataset A relies on single-source validation.",
        "Dataset B tasks focus on scientific, astronomical, or technical subject matter (e.g., exoplanets, software requirements), while dataset A emphasizes practical, everyday needs (e.g., recipes, event planning).",
        "Tasks in dataset B often require precise timestamps or version-specific data (e.g., latest software version compatibility), whereas dataset A prioritizes general time sensitivity (e.g., latest news, recent prices).",
        "Dataset B includes tasks targeting community-driven metrics (e.g., Reddit member counts, trending topics), while dataset A focuses on institutional or commercial data (e.g., hotel bookings, medical guidelines).",
        "Tasks in dataset B involve parsing technical identifiers (e.g., commit SHAs, astronomical system names), whereas dataset A tasks center on human-readable content (e.g., event venues, recipes).",
        "Dataset B tasks frequently require interpreting platform-specific terminology (e.g., GitHub commits, AirDrop functionality), while dataset A uses broadly understood search terms (e.g., \u201c5-star hotel,\u201d \u201cSEO tools\u201d)."
      ],
      "nnetnav_live_site=google_search_num_tasks=72_portion=1": [
        "Dataset B tasks require interaction with developer tools/version control systems (e.g., GitHub commit SHAs)",
        "Dataset B emphasizes retrieving exact numerical values/measurements (e.g., astronomical distances, system requirements)",
        "Dataset B contains tasks requiring identification of technical specifications (hardware/software requirements)",
        "Dataset B includes explicit requests for sorted/ranked lists (e.g., 'top 3', 'top 5', 'top-10')",
        "Dataset B tasks frequently require working with version-specific information (e.g., 'latest version')",
        "Dataset B emphasizes temporal precision with phrases like 'as of today's date' and 'most recent'",
        "Dataset B contains tasks requiring data aggregation from specialized tracking platforms (e.g., FlightAware)",
        "Dataset B includes explicit instructions for data manipulation (copy/paste operations)",
        "Dataset B tasks focus on astronomical/planetary system information retrieval",
        "Dataset B requires identification of community metrics (e.g., member counts, engagement statistics)"
      ]
    }
  }
}