{
  "sims": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Queries frequently involve filtering courses based on their level (e.g., basic, medium, Level_500).",
        "Both datasets include questions about professors' employment status (e.g., faculty vs. non-faculty).",
        "Questions often request counts or aggregations (e.g., total courses, students, averages).",
        "Queries focus on professor-student relationships, particularly advising (e.g., advisor IDs linked to students).",
        "Both datasets require joining professor data with course data (e.g., courses taught by specific professors).",
        "Questions use numerical constraints (e.g., more than 2 years, fewer than 3 courses).",
        "Queries reference specific IDs (e.g., professor IDs, student IDs, course IDs) for precise filtering.",
        "Both include comparisons between categories (e.g., faculty vs. non-faculty, basic vs. medium courses).",
        "Questions frequently filter results by student or professor phases (e.g., pre-qualification, years in program).",
        "Queries often ask for ranked results (e.g., top professors, courses with the most instructors)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Queries focus on professors identified by specific numerical IDs.",
        "Questions involve retrieving courses taught by specific professors.",
        "Both datasets use course levels as a filtering criterion (e.g., 'basic/medium/high-level' in A vs. 'Level_500' in B).",
        "Queries require counting instances (e.g., courses taught, students advised).",
        "Questions include advisor-student relationships tied to numerical IDs.",
        "Both datasets filter results based on professor employment status (e.g., faculty/non-faculty in A vs. program membership in B).",
        "Queries request course IDs and professor IDs in paired responses.",
        "Questions target professors' teaching workloads (e.g., 'how many courses taught').",
        "Both datasets include numerical comparisons (e.g., course counts, student counts).",
        "Queries involve exact ID-based lookups for professors and courses in all samples."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Both datasets query courses taught by professors with specific conditions (e.g., faculty status, course level).",
        "Questions in both involve filtering by course level (e.g., undergraduate, medium, high-level).",
        "Both reference faculty or position status of professors (e.g., 'faculty member,' 'faculty affiliated position').",
        "Advisor-student relationships are queried in both (e.g., students advised by specific professors).",
        "Both use numerical thresholds (e.g., 'more than 4 people,' 'at least two professors').",
        "Questions in both reference course IDs and professor IDs for granular filtering.",
        "Both include queries about student phases or program years (e.g., '5th year,' 'pre-phase of qualification').",
        "Aggregate functions (e.g., count, sum, average) are used in queries across both datasets.",
        "Both datasets ask about professors teaching courses of specific difficulty levels (e.g., basic, medium, high-level).",
        "Queries in both involve conditional logic (e.g., 'AND/OR' operators) to combine criteria like course level and professor status."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets include questions about the number of courses taught by professors (e.g., 'how many courses' or 'total number of courses').",
        "Both explicitly reference course levels (e.g., 'basic or medium undergraduate courses' in A and 'level 500 or higher' in B).",
        "Both use specific entity IDs (e.g., professor IDs, course IDs, student IDs) to filter or retrieve data.",
        "Both involve counting or aggregating numerical values (e.g., 'how many,' 'sum,' 'average,' 'total number').",
        "Both ask about professors' attributes (e.g., faculty status in A and years in the program in B).",
        "Both include queries about student enrollment phases (e.g., 'pre-phase of qualification' in A and 'phase 2' in B).",
        "Both focus on filtering courses by difficulty or level categories (e.g., 'high-level or harder' in A vs. 'graduate level' in B).",
        "Both ask about advisor-student relationships (e.g., 'advisors of student ID 303' in A and 'advised by professors' in B).",
        "Both require listing specific course or professor IDs based on conditions (e.g., 'list course IDs' in both datasets).",
        "Both include questions about the number of courses per professor (e.g., 'professor teaches most courses' in A vs. 'courses taught by each professor' in B)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query specific entities using exact IDs (e.g., course_id, professor_id, student_id).",
        "Questions frequently involve retrieving or filtering by course level (e.g., basic, medium, high-level).",
        "Both include requests to identify professors associated with specific courses or students.",
        "Advisor-student relationships are a common focus (e.g., students advised by a professor).",
        "Aggregation operations like counting courses, students, or professors are present in both datasets.",
        "Attributes like professor position status (e.g., faculty member) or program duration (e.g., years in program) are queried.",
        "Course difficulty or classification (e.g., undergraduate, master) is used as a filtering criterion.",
        "Student program phases (e.g., year of study, qualification phase) are referenced in queries.",
        "Existence checks (e.g., professors who are also students, non-faculty members) appear in both datasets.",
        "Queries often use compound conditions (e.g., 'basic or medium undergraduate courses,' 'taught by person with id X')."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query relationships between professors and the courses they teach.",
        "Both focus on retrieving specific course levels (e.g., basic, medium, Level_400, Level_500).",
        "Questions in both datasets filter results using exact numerical IDs for professors, courses, or students.",
        "Both include queries about professors' employment status or roles (e.g., faculty members, non-faculty).",
        "Both datasets ask for lists of professors or courses meeting specific criteria (e.g., teaching \u22653 courses, specific course levels).",
        "Questions in both sets involve direct mappings between professors and their taught courses (e.g., 'taught by professor X').",
        "Both reference hierarchical course classifications (e.g., undergraduate, master, professional levels).",
        "Queries in both datasets use exact string matching for attributes like courseLevel (e.g., 'Level_400').",
        "Both include questions about professor-student advisory relationships (e.g., advisor IDs, student status).",
        "Both datasets require joins across entities (e.g., professors \u2194 courses \u2194 students) to answer questions."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Both datasets query courses taught by professors with specific attributes (e.g., faculty status in A, years in program in B).",
        "Questions in both datasets filter results based on course levels (e.g., 'undergraduate,' 'master,' or hierarchical levels like 'basic' or 'high').",
        "Both include queries about professor IDs and course IDs to identify specific entities.",
        "Numerical thresholds are used in filtering (e.g., 'more than 4 people' in A, 'at least 5 years' in B).",
        "Aggregation functions (e.g., count, average, sum) are employed to quantify results in both datasets.",
        "Queries in both datasets combine multiple criteria (e.g., course level + faculty status in A, years in program + course level in B).",
        "Both datasets focus on professors' roles or statuses (e.g., faculty membership in A, years in program in B).",
        "Questions in both datasets request lists of professors or courses matching specific conditions.",
        "Hierarchical course classifications (e.g., 'basic,' 'medium,' 'high') are used as filters in both datasets.",
        "Both datasets involve relational logic (e.g., professors teaching courses, advisors linked to students or courses)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Both datasets focus on querying counts of entities (courses, students, professors).",
        "Both involve filtering results by course levels (e.g., undergraduate, graduate, advanced).",
        "Queries in both datasets frequently reference specific IDs (professor IDs, course IDs, student IDs).",
        "Both include questions about faculty/position statuses of professors (e.g., faculty membership).",
        "Questions in both datasets relate professors to the courses they teach.",
        "Both use aggregation functions (e.g., SUM, COUNT, AVG) in analytical queries.",
        "Queries in both datasets filter by student phases or academic years (e.g., 'pre-phase', '5th year').",
        "Both ask for lists of professors teaching specific course levels or types (e.g., high-level undergraduate courses).",
        "Questions in both datasets involve advisor-student relationships (e.g., advisor IDs linked to students).",
        "Both include comparisons of course attributes (e.g., difficulty level, number of professors per course)."
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets query user-created movie lists and their metadata (titles, followers, creation dates)",
        "Both require filtering results based on user subscription status (trialist/subscriber) or payment method",
        "Both involve aggregate calculations (average ratings, counts, percentages) across user/movie data",
        "Both reference specific movie attributes: directors, release years, titles, and popularity metrics",
        "Both use numerical thresholds for filtering (>5 followers, <50 movies, >200 followers, etc.)",
        "Both include time-based constraints (creation timestamps, movie release years, recent updates)",
        "Both require identification of users/directors through IDs and associated metadata (avatars, names)",
        "Both calculate percentages of entities meeting specific conditions (lists, movies, users)",
        "Both involve table joins between users, lists, movies, ratings, and directors",
        "Both contain queries seeking top-ranked items (most followers, highest ratings, most popular movies)"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Queries involve retrieving specific numerical metrics (e.g., counts, averages, percentages) from the datasets.",
        "Both datasets include questions about movie attributes such as title, release year, director, and popularity.",
        "User-generated lists are a focus, including details like list titles, creation timestamps, and follower counts.",
        "Questions frequently filter results using conditions like date ranges (e.g., \"after 2010/1/1\", \"released in 2000\").",
        "Aggregate functions (e.g., MAX, AVG, COUNT) are commonly used to identify top-ranked or extreme values (e.g., \"most popular\", \"highest rating\").",
        "User-specific identifiers (e.g., user IDs) are used to filter or retrieve personalized data (e.g., lists created by a specific user).",
        "Both datasets reference relational data structures (e.g., joining users, lists, movies, and ratings).",
        "Questions often target metadata such as URLs (e.g., user avatars, list links) and timestamps (e.g., creation/update dates).",
        "Explicit use of logical operators (e.g., \"not more than 2\", \"over 200 followers\") to constrain results.",
        "Focus on social engagement metrics (e.g., followers, likes, ratings) tied to users or content."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Queries reference user attributes (e.g., subscriber, trialist, payment method) to filter results.",
        "Questions involve aggregation functions (e.g., average rating, count, percentage) on numeric fields like ratings or followers.",
        "Queries target movie lists, including list titles, IDs, creation/update dates, and follower counts.",
        "Questions focus on movies, including titles, directors, release years, ratings, popularity metrics, and IDs.",
        "Time-based filters (e.g., creation/update dates, recent ratings) are used to constrain results.",
        "Queries involve comparisons (e.g., 'most popular,' 'highest average,' 'top 3') to rank or limit outputs.",
        "User IDs and movie IDs are explicitly referenced to retrieve specific entities.",
        "Questions require joining user status (e.g., trial eligibility, subscription) with actions like list creation or rating.",
        "Queries include conditional logic (e.g., 'at least X followers,' 'rating score > Y') to filter datasets.",
        "Requests extract metadata like URLs (avatar, list links) or textual descriptions alongside quantitative data."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets query average rating scores for movies.",
        "Both datasets reference movie release years as a key filter or attribute.",
        "Both datasets include questions about the most popular movies based on user engagement metrics (e.g., ratings, followers).",
        "Both datasets require counting movies or users under specific conditions (e.g., release year, rating thresholds).",
        "Both datasets involve identifying directors associated with movies.",
        "Both datasets include queries about list metadata (e.g., titles, followers, creation timestamps).",
        "Both datasets use numerical thresholds (e.g., rating >4, followers >200) to filter results.",
        "Both datasets reference user engagement roles or statuses (e.g., trialists, critics) in queries.",
        "Both datasets ask for unique identifiers (e.g., user IDs, list IDs, director IDs).",
        "Both datasets require aggregating data across tables (e.g., movies, users, ratings, lists)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets involve queries about movie titles and their attributes (e.g., ratings, directors, popularity).",
        "Both reference user-generated lists and their properties (e.g., followers, creation dates, list IDs).",
        "Aggregate functions like average ratings, counts, and maximum values are used in questions from both datasets.",
        "Queries in both datasets filter results using numeric thresholds (e.g., ratings > 3, followers > 200).",
        "Both include explicit references to unique identifiers (e.g., user IDs, movie IDs, list IDs).",
        "Time-based constraints (e.g., creation dates, release years) appear in questions from both datasets.",
        "Both datasets focus on ranking metrics (e.g., \"most popular,\" \"highest rating,\" \"most followers\").",
        "Questions in both datasets target metadata about users (e.g., avatars, subscription status, follower counts).",
        "Both require comparisons between entities (e.g., \"most recent,\" \"highest amount,\" \"lowest number\").",
        "Conditional user statuses (e.g., trialists, subscribers) are used as filters in queries from both datasets."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query movies by specific titles in their questions.",
        "Both reference user IDs to retrieve or filter information about users.",
        "Both involve queries related to list titles and list IDs.",
        "Both include questions about average rating scores for movies.",
        "Both use popularity metrics (e.g., 'most popular movie') as a criterion.",
        "Both reference the number of followers associated with lists or movies.",
        "Both filter results based on user attributes (e.g., subscriber status, payment method).",
        "Both utilize aggregate functions like count, average, and max in queries.",
        "Both include time-based filters (e.g., 'recently updated,' 'last year').",
        "Both ask for specific rating scores (e.g., 4, 5) as criteria for analysis."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Both datasets query movie titles and list titles based on specific attributes like popularity, ratings, or user status.",
        "Both involve filtering results by user subscription status (e.g., trialist, subscriber, paid users).",
        "Both require aggregations such as counts, averages, and maximum values (e.g., 'most popular,' 'highest rating').",
        "Both reference time-based conditions (e.g., 'updated most recently,' 'created after 2010/1/1,' 'greater than 2021-01-01').",
        "Both include queries about user-generated lists and their properties (e.g., followers, update timestamps, movie numbers).",
        "Both use explicit identifiers like user IDs, movie IDs, and list IDs for precise data retrieval.",
        "Both focus on relationships between entities (e.g., users creating lists, movies in lists, directors linked to movies).",
        "Both incorporate conditional logic for ratings (e.g., 'rating of 4,' 'rated 5/5,' 'not more than 2').",
        "Both analyze popularity metrics (e.g., 'most popular movie,' 'number of followers').",
        "Both include structured filters for numerical thresholds (e.g., 'list_movie_number greater than 5,' 'over 200 followers')."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "All questions involve querying structured data entities (movies, users, or lists).",
        "Each question includes at least one explicit filtering condition (e.g., numerical thresholds, temporal constraints).",
        "Questions request specific attributes or aggregated results (e.g., counts, averages, titles).",
        "All questions can be translated into SQL queries using SELECT and WHERE clauses.",
        "Interrogative words (e.g., 'what,' 'how many,' 'which') define the desired output type.",
        "Answers require precise numerical, categorical, or textual data (e.g., IDs, names, counts).",
        "Conditions frequently involve logical comparisons (e.g., >, =, 'most recent,' 'highest').",
        "Entity relationships are implied (e.g., users creating lists, movies linked to directors).",
        "Focus on user engagement metrics (e.g., ratings, followers, likes) is consistent.",
        "All questions target retrieval of existing data rather than speculative or hypothetical scenarios."
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets focus on querying app attributes such as ratings, installs, categories, and pricing (free/paid).",
        "Questions in both use numerical thresholds (e.g., ratings above 4.0, installs over 10,000).",
        "Aggregation functions (average, count, percentage) are commonly used in both datasets.",
        "Filtering by app category is a frequent criterion in queries across both datasets.",
        "Both datasets include requests for top N rankings (e.g., top 5 apps by rating or installs).",
        "Queries often combine multiple conditions (e.g., category, rating, installs, price).",
        "References to app content ratings (e.g., 'Teen', 'Everyone') appear in some questions in both datasets.",
        "Both datasets inquire about free apps and their specific attributes (e.g., average rating, install counts).",
        "Questions in both seek to retrieve app names based on specified criteria.",
        "Both datasets involve calculating percentages or ratios in certain queries (e.g., percentage of apps meeting criteria)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets focus on app ratings, including specific values, averages, counts, or thresholds (e.g., >4.0, 5 stars).",
        "Queries filter or group results by app categories (e.g., 'GAME', 'SOCIAL') or content ratings (e.g., 'Teen').",
        "Aggregate functions (e.g., average, count, percentage) are used to analyze rating distributions or review metrics.",
        "Numerical thresholds are applied (e.g., ratings above 4.5, apps with >10k reviews, 5,000+ installs).",
        "Specific apps are queried by name (e.g., 'Dragon Ball Legends', 'Fun Cube 2') for rating or review details.",
        "User feedback is analyzed quantitatively (B: review counts; A: sentiment polarity/subjectivity).",
        "Queries segment data by app attributes like price (free/paid), compatibility (Android versions), or update status.",
        "Results often require listing or ranking apps (e.g., 'top 5 apps', 'highest-rated apps').",
        "Questions compare metrics across categories (e.g., average rating in 'FAMILY' vs. 'GAME').",
        "Metadata like app versions, genres, and install ranges are used to refine results."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Both datasets include questions about app ratings (e.g., 'rating of 4.5 and above' in A, 'rating of 4.5 or higher' in B).",
        "Both focus on filtering apps by specific genres or categories (e.g., 'puzzle games' in A, 'action genre' in B).",
        "Both use aggregate metrics like averages, totals, or percentages (e.g., 'average sentiment polarity score' in A, 'average rating' in B).",
        "Both involve analyzing sentiment-related metrics (e.g., 'sentiment polarity' in A, 'negative sentiment than positive' in B).",
        "Both require filtering by install counts (e.g., '5,000+ installs' in A, '1,000,000 times' in B).",
        "Both ask for top-ranked apps based on reviews, ratings, or installs (e.g., 'top 5 shopping apps' in A, 'top 3 most downloaded apps' in B).",
        "Both reference app metadata like version numbers or update years (e.g., 'current version' in A, 'updated since 2020' in B).",
        "Both include questions about free apps (e.g., 'free application' in A, 'free apps in the action category' in B).",
        "Both use comparative language (e.g., 'more negative sentiment than positive' in A, 'higher rating than 4.5' in B).",
        "Both require cross-referencing app attributes (e.g., 'genre and installs' in A, 'category and installs' in B)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets focus on app ratings, including average ratings and counts of apps with specific ratings.",
        "Both involve categorization of apps by genre or category (e.g., 'Tools', 'Family', 'Games').",
        "Queries in both datasets use aggregation functions like average, count, and sum (e.g., average rating, total reviews).",
        "Both include filtering based on thresholds (e.g., rating >4.5, reviews >10,000, installs >5,000).",
        "Both datasets seek top-ranked apps (e.g., 'top 5 most reviewed apps').",
        "Questions in both reference app metadata such as names, categories, and install numbers.",
        "Both involve analysis of reviews (e.g., sentiment polarity in A, positive/neutral/negative counts in B).",
        "Queries in both datasets group data by attributes like category or genre for statistical analysis.",
        "Both require comparisons or ratios (e.g., percentage of negative reviews in A, average rating ratios in B).",
        "Questions in both datasets retrieve specific app attributes (e.g., name, rating, category) in results."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets focus on querying app ratings (specific or average).",
        "Both involve filtering apps based on specific categories (e.g., 'Family', 'Business').",
        "Questions in both datasets require counting reviews or categorizing them (e.g., positive/neutral).",
        "Both include aggregation functions like average, total, or percentage calculations.",
        "Queries in both datasets filter results using numerical thresholds (e.g., ratings >4.0, 5-star ratings).",
        "Both reference explicit app names or titles in questions (e.g., 'Dragon Ball Legends', 'Holy Quran Mehmet Emin Ay').",
        "Both datasets ask for metadata comparisons (e.g., free vs. paid apps, install numbers).",
        "Questions in both datasets target app versions or update timelines (e.g., 'current version', 'not updated since 2015').",
        "Both include queries about app genres or categories (e.g., 'Puzzle', 'DATING').",
        "Both datasets require grouping results by attributes like category, rating, or sentiment."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets involve queries about app ratings, either for specific apps (A) or average ratings across categories (B).",
        "Both datasets include questions that filter results based on app categories (e.g., Family, Lifestyle, Games).",
        "Both datasets reference content ratings (e.g., \"Teen\") as a filtering criterion.",
        "Both datasets use install counts as a threshold (e.g., \"5,000+\" in A, \"500\" in B).",
        "Both datasets contain queries about free apps and their attributes (e.g., ratings, installs).",
        "Both datasets require aggregation of data, such as averages (B) or percentages/counts (A).",
        "Both datasets include questions that combine multiple criteria (e.g., category + rating + install count).",
        "Both datasets ask for app names alongside specific metrics (e.g., ratings, categories).",
        "Both datasets focus on identifying top-performing apps (e.g., \"top 5\" in A, \"top-rated\" in B).",
        "Both datasets contextualize queries within app store platforms (e.g., Google Play Store)."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Both datasets involve queries about app ratings, either for specific apps or aggregated averages.",
        "Both datasets include questions that filter or categorize apps by genres or categories (e.g., 'Game,' 'Entertainment').",
        "Both datasets reference app popularity metrics such as install counts (e.g., '5,000+ installs' in A, '10 million installs' in B).",
        "Both datasets use aggregate functions like averages, percentages, and counts (e.g., 'average rating' in B, 'percentage of free apps' in A).",
        "Both datasets ask for rankings or top apps based on criteria like ratings, reviews, or installs (e.g., 'top 5 apps' in both).",
        "Both datasets include sentiment-related queries (e.g., 'sentiment polarity' in A, 'positive reviews' in B).",
        "Both datasets filter results using rating thresholds (e.g., 'rating above 4.2' in B, 'rating 4.5 and above' in A).",
        "Both datasets incorporate price-related conditions (e.g., 'free apps' in A, 'price less than 5 dollars' in B).",
        "Both datasets reference app metadata like versions or update dates (e.g., 'not updated since 2018' in A, 'current version' in A).",
        "Both datasets use comparative terms like 'most,' 'top,' or 'highest' to identify standout apps."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Both datasets include questions about app ratings (e.g., average, highest, or threshold-based ratings).",
        "Both reference specific app names (e.g., 'Instagram,' 'Dragon Ball Legends') for targeted queries.",
        "Questions in both datasets involve aggregating data (e.g., counts, averages, percentages).",
        "Both focus on app categories or genres (e.g., 'Games,' 'Tools') as a key dimension.",
        "User reviews are analyzed in both datasets (e.g., total reviews, sentiment polarity).",
        "Both include queries about thresholds (e.g., apps with ratings >4.0, installs >5,000).",
        "Sentiment analysis (e.g., polarity, subjectivity, neutral/negative/positive classifications) is a shared feature.",
        "Top-ranked apps (e.g., 'top 5,' 'highest rated') are identified in both datasets.",
        "Both datasets require filtering apps based on metadata (e.g., update year, content rating).",
        "Comparative analysis (e.g., category vs. category, free vs. paid) is present in questions from both datasets."
      ]
    }
  },
  "diffs_synth_from_real": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries where students are referenced as teaching courses (e.g., 'student teaching'), while A focuses exclusively on professors as instructors.",
        "Dataset B explicitly requests course names (e.g., 'names of the courses'), whereas A refers to courses only by IDs and levels.",
        "Dataset B references specific academic faculties (e.g., 'Faculty of Mathematics'), while A uses generic terms like 'faculty employees' without naming departments.",
        "Dataset B contains questions asking for combined totals of professors and students (e.g., 'total number of professors and students'), which A does not.",
        "Dataset B uses exact course level labels like 'Level_200' and 'Level_500' consistently, whereas A uses qualitative terms like 'basic' or 'high-level'.",
        "Dataset B explicitly requests minimum/maximum values (e.g., 'minimum and maximum number of years'), while A focuses on averages, sums, or counts with constraints.",
        "Dataset B uses phase labels like 'Pre_Quals' and 'Phase 0', while A uses phrases like 'pre-phase of qualification' or 'years in program'.",
        "Dataset B includes negation filters (e.g., 'courses not taught by professor 240'), which A does not employ.",
        "Dataset B asks for student counts per course level (e.g., 'total number of students in each level'), while A aggregates counts without explicit level groupings.",
        "Dataset B references departmental affiliations (e.g., 'position in a department'), whereas A focuses on faculty/non-faculty status without departmental specificity."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B queries explicitly reference professors by ID using phrases like 'professor 335' or 'professor ID 200', whereas A uses indirect references like 'professor who advised student ID \"303\"'",
        "Dataset B includes questions about student names (e.g., 'name of the person') and personal attributes, which never appear in Dataset A",
        "Dataset B uses standardized course level labels (e.g., 'Level_500') as discrete values, while A uses relative qualitative descriptors like 'basic/medium/high-level'",
        "Dataset B contains questions about dual-role individuals ('both a student and a professor'), a concept absent in Dataset A",
        "Dataset B explicitly asks for 'the highest course level in the database', while A focuses on categorical level comparisons (basic vs medium vs high)",
        "Dataset B queries use the phrase 'in the program' for status context, whereas A uses 'faculty/non-faculty' employment status",
        "Dataset B includes direct numerical comparisons between specific courses (e.g., 'course no.16 or course no.18'), while A compares categorical course groups",
        "Dataset B asks for per-professor breakdowns of student advising counts ('each professor'), whereas A focuses on aggregate counts without individual breakdowns",
        "Dataset B contains explicit requests for 'the name of the student', while A exclusively references students by ID and never by name",
        "Dataset B uses the specific attribute 'yearsInProgram' for students, a granular metric not present in Dataset A's phase-based qualifications"
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B queries use numerical course level designations (e.g., Level 300, Level_400) rather than qualitative descriptors like 'basic/medium/high-level' used in A",
        "Dataset B includes explicit references to professors' experience duration (e.g., 'more than 5 years') while A does not mention temporal experience thresholds",
        "Dataset B contains queries about professors' program phases (e.g., 'Phase 1') rather than student qualification phases referenced in A",
        "Dataset B specifies numeric ranges for professor IDs (e.g., 'greater than 10 and <=15') while A only uses exact ID matching",
        "Dataset B asks about courses taught exclusively by single professors ('only one professor teaching it'), a concept absent in A's queries",
        "Dataset B references named courses (e.g., 'Data Structures and Algorithms') while A only uses course IDs and level descriptors",
        "Dataset B queries include positional comparisons ('highest level of courses') using MAX-like logic not explicitly required in A's samples",
        "Dataset B contains explicit exclusion criteria for advice relationships ('have not been advised by any professors') while A focuses on existence of relationships",
        "Dataset B uses more varied position descriptors (e.g., 'Faculty_eme', 'full-time professors') compared to A's consistent 'faculty member' terminology",
        "Dataset B includes multi-advisor scenarios ('students with more than one advisor') while A only references single advisor relationships"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B does not include questions about faculty membership or position status of professors (e.g., 'faculty employees' or 'position in university'), which are common in A.",
        "Dataset B uses generic phase labels like 'phase 2' for student enrollment, whereas A uses more specific descriptors like 'pre-phase of qualification' or 'eighth year of program'.",
        "Dataset B avoids combining multiple aggregation operations (e.g., 'sum of year 1 and year 2 students' in A) and focuses on singular counts or totals.",
        "Dataset B does not reference direct comparisons between entity IDs (e.g., 'course no.16 or course no.18' in A) or 'top N' rankings (e.g., 'top 5 professors').",
        "Dataset B includes questions about student-to-student advising relationships (e.g., 'advised by at least one other student'), which are absent in A.",
        "Dataset B uses standardized course level categories (e.g., 'Level_100', 'graduate level') instead of qualitative terms like 'high-level or harder' in A.",
        "Dataset B omits queries about professors' positional roles (e.g., 'position of this person in the university') and focuses on their tenure (e.g., 'years in the program').",
        "Dataset B includes redundant or repeated questions (e.g., 'How many students are currently in the program?' appears multiple times), unlike A's varied phrasing.",
        "Dataset B does not require combining course difficulty with faculty status (e.g., 'courses taught by a faculty member') as seen in A.",
        "Dataset B includes queries about student enrollment counts per course (e.g., 'number of students enrolled in each course'), which are absent in A."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries never reference specific academic phases like 'pre-phase of qualification' or 'qualification phase' present in Dataset A.",
        "Dataset B does not include aggregation operations involving sums or averages (e.g., 'sum of year 1 and year 2 students'), whereas Dataset A does.",
        "Dataset B queries lack explicit comparisons between entities (e.g., 'course no.16 or course no.18'), unlike Dataset A.",
        "Dataset B does not use terms like 'teacher' or 'non-faculty members'; it exclusively uses 'professor' or 'person' in attribute references.",
        "Dataset B includes direct references to professor/staff names (e.g., 'Professor Jane'), while Dataset A only uses numeric IDs and positional roles.",
        "Dataset B queries never combine course difficulty (e.g., 'high-level or harder') with program phases in compound conditions, unlike Dataset A.",
        "Dataset B does not request ranking operations (e.g., 'top 5 professors'), which are common in Dataset A.",
        "Dataset B lacks queries about students being advised to teach courses, a specific condition present in Dataset A.",
        "Dataset B does not filter by granular program duration ranges (e.g., 'more than 5 years') beyond simple existence checks.",
        "Dataset B queries omit explicit mentions of faculty employment status (e.g., 'member of faculty') as a standalone filter, focusing instead on role existence (professor/student)."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Queries in dataset B do not involve aggregations (e.g., counts, averages, sums) unlike dataset A, which frequently uses functions like 'sum,' 'average,' or 'how many.'",
        "Dataset B questions focus on retrieving direct mappings (e.g., 'Who taught course X?') without combining multiple criteria (e.g., faculty status + course level) seen in dataset A.",
        "Dataset B does not use comparative terms (e.g., 'most,' 'top 5,' 'highest number') present in dataset A queries.",
        "Dataset B lacks questions about professors' employment status (e.g., faculty/non-faculty) as a standalone filter, unlike dataset A.",
        "Dataset B does not reference student progress phases (e.g., 'pre-phase of qualification') or program years (e.g., '5th year') present in dataset A.",
        "Dataset B queries never combine course levels with employment status (e.g., 'courses taught by faculty members') as seen in dataset A.",
        "Dataset B uses only exact course levels (e.g., Level_400) without hierarchical classifications like 'undergraduate' or 'professional' used in dataset A.",
        "Dataset B does not ask for multi-part answers (e.g., 'course ID and level') and instead requests singular attributes (e.g., 'course level').",
        "Dataset B includes trivial temporal filters (e.g., 'more than 0 years') absent in dataset A, which uses more specific thresholds (e.g., '\u22653 courses').",
        "Dataset B does not require joins involving students for course/professor queries unless explicitly about advisorship, whereas dataset A often links professors, courses, and students in complex joins."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries include references to professors by name (e.g., 'professor John'), while A exclusively uses IDs or positional terms like 'teacher no.79'.",
        "Dataset B filters professors based on explicit time-based criteria (e.g., 'yearsInProgram'), whereas A focuses on faculty membership or employment status.",
        "Dataset B includes queries where students in specific program years (e.g., '2nd year') teach courses, which A does not address.",
        "Dataset B uses course levels like 'masters' and 'advanced' as standalone classifications, while A employs hierarchical terms like 'basic,' 'medium,' and 'high' often combined with degree types (e.g., 'undergraduate').",
        "Dataset B lacks complex aggregation operations (e.g., sum of student years, averages) present in A, focusing instead on counts or simple comparisons.",
        "Dataset B does not include explicit result limits (e.g., 'list any five') found in A's queries.",
        "Dataset B ties advising conditions to course levels (e.g., 'advises students in a course with a level of advanced'), while A links advising to student phases (e.g., 'pre-phase of qualification').",
        "Dataset B does not query student counts under advisors (e.g., 'How many students are under advisor 415?'), a common feature in A.",
        "Dataset B omits direct numerical comparisons between specific courses (e.g., 'course no.16 or course no.18') seen in A.",
        "Dataset B allows professors to teach courses while being in early program years (e.g., 'first year of a program'), whereas A\u2019s professors are filtered by faculty/non-faculty status without such temporal granularity."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B queries frequently use generic terms like 'person' or 'people' instead of specific entity names like 'professor' or 'student' used in A.",
        "Queries in B often omit explicit references to schema elements (e.g., asking 'course levels offered by the university' instead of referencing a specific table/column like A's examples).",
        "B contains redundant/repeated questions about basic relationships (e.g., multiple variations of 'Which courses are taught by professors?') without added complexity.",
        "B's queries show inconsistent casing (e.g., 'courseLevel' vs 'course level') compared to A's consistent formatting of database terms.",
        "B includes ambiguous placeholder values (e.g., 'student X') instead of concrete IDs like A's explicit references (e.g., 'student ID \"303\"').",
        "B's aggregation questions are limited to simple COUNT operations, while A uses more diverse aggregations (SUM, AVG) and ranking (top N queries).",
        "B contains schema exploration questions (e.g., 'course levels available in the database') absent in A, which assumes schema knowledge.",
        "B's filtering conditions are simpler (e.g., single WHERE clauses) compared to A's frequent use of compound conditions (AND/OR combinations).",
        "B lacks questions comparing specific entity attributes (e.g., 'course 16 vs 18') that are common in A.",
        "B includes non-specific relationship queries (e.g., 'who teaches which courses') without filters/aggregations, while A ties relationships to concrete attributes (faculty status, course levels)."
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B queries frequently filter lists by specific substring patterns in titles (e.g., 'contains the word \"s\"') while A does not",
        "Dataset B includes explicit sorting requirements (e.g., 'descending order of followers') whereas A only implies ranking through 'top-ranked'",
        "Dataset B contains queries about users not following their own created lists, which A never references",
        "Dataset B requires identification of movies in thematic/categorical lists (e.g., 'ballet movies', 'WW2 list') while A only references generic lists",
        "Dataset B calculates metrics tied to list creators' activity patterns (e.g., 'users who created lists with >5 movies') rather than just subscription status like A",
        "Dataset B explicitly handles tie-breaking scenarios in rankings ('if multiple... list all') which A never addresses",
        "Dataset B references platform-specific user segments ('all Mubi users') while A uses generic user classifications",
        "Dataset B queries combine multiple membership statuses (e.g., 'both trialist and subscriber') whereas A only uses singular status filters",
        "Dataset B includes payment method as standalone filter criteria while A only associates it with subscription status",
        "Dataset B omits requests for user media attributes (avatars/portrait pictures) that are present in A"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B questions focus on single-part queries without multi-component requests (e.g., no combined 'indicate X and Y' structures)",
        "Dataset B explicitly references table names in queries (e.g., 'in the 'lists' table', 'ratings_users table')",
        "Dataset B contains no references to user statuses or eligibility (e.g., trialist, subscriber status)",
        "Dataset B includes more requests for unique counts (e.g., 'unique movies', 'number of users') without conditional user states",
        "Dataset B uses simpler aggregate requests without combined filter-aggregate patterns (e.g., no 'average of users who were trialists')",
        "Dataset B contains direct requests for total sums across entire datasets (e.g., 'total number of followers for all lists')",
        "Dataset B repeats identical question structures across multiple samples (e.g., 5 variations of 'movie with highest rating score')",
        "Dataset B omits requests for descriptive metadata fields (e.g., no user avatar URLs, list descriptions, or rating comments)",
        "Dataset B uses simpler logical constraints (e.g., 'more than 10' vs A's 'not more than 2 while excluding trialists')",
        "Dataset B includes explicit requests for creation date counts (e.g., 'lists created in 2015') rather than date-range comparisons"
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Queries in B focus on list titles and IDs (e.g., 'list with title containing Avengers') rather than user-specific metadata like avatar URLs or textual descriptions (common in A).",
        "B includes explicit filtering based on list title content (e.g., 'title contains the word Avengers') while A does not reference list title text patterns.",
        "B uses 'payment method' as a filter (e.g., 'users with a payment method') whereas A uses 'subscriber' or 'trialist' statuses for similar constraints.",
        "B aggregates data based on list properties (e.g., 'lists with more than 100 followers') without tying them to user eligibility states (e.g., trialists), unlike A.",
        "B explicitly references named lists (e.g., 'Mubi's Top Lists', 'My Top 100 Films'), while A does not mention predefined list names.",
        "B queries involve movie popularity within list contexts (e.g., 'movie in a movie list by a user with both trial and paid subscription') without combining popularity with user statuses like A does.",
        "B includes multi-layered aggregation (e.g., 'director who directed at least 10 movies between 1960–1985') whereas A’s aggregation is simpler (e.g., 'average rating').",
        "B omits percentage-based calculations (e.g., 'percentage of rated movies from 2021') present in A.",
        "B focuses on movie-to-list relationships (e.g., 'movie with ID 1000 in a list') without referencing user-created metadata like list descriptions (common in A).",
        "B queries movie release years as standalone entities (e.g., 'release years with least movies'), while A ties years to user ratings (e.g., 'rated movies released in 2021')."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B does not reference user engagement roles or statuses (e.g., trialists, critics) directly in filtering or aggregation criteria, unlike Dataset A.",
        "Dataset B lacks queries about user-specific metadata (e.g., avatars, subscriber eligibility during list creation, user follower counts), which are prominent in Dataset A.",
        "Dataset B does not include percentage-based calculations (e.g., percentage of rated movies in a year), which are present in Dataset A.",
        "Dataset B focuses on movie popularity metrics tied to rating counts or general popularity numbers, while Dataset A includes layered popularity criteria (e.g., likes per critic per movie).",
        "Dataset B does not use multi-condition filters combining user roles and numerical thresholds (e.g., 'users who were trialists with ratings \u22642'), unlike Dataset A.",
        "Dataset B does not query hierarchical relationships (e.g., 'highest likes per critic per movie'), which are explicit in Dataset A's structured aggregations.",
        "Dataset B does not reference list-specific attributes like follower counts, creation timestamps, or list size thresholds in combination with user roles, unlike Dataset A.",
        "Dataset B avoids questions requiring joins between user-specific actions (e.g., ratings) and list ownership metadata, which are common in Dataset A.",
        "Dataset B does not include temporal conditions tied to user status changes (e.g., 'created when eligible for trial'), unlike Dataset A.",
        "Dataset B omits queries requiring dual outputs (e.g., director name + release year + average rating) in a single question, favoring simpler single-metric requests."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries focus on retrieving single attributes (e.g., titles, counts) without requiring combined multi-part answers seen in Dataset A",
        "Dataset B questions lack explicit conditional clauses involving user subscription status timelines (e.g., 'when they were trialists') present in Dataset A",
        "Dataset B never requests URL/portrait assets (avatars, list links) that appear in Dataset A queries",
        "Dataset B avoids percentage calculations and ratio-based metrics that Dataset A explicitly requests",
        "Dataset B queries don't require joining multiple entity relationships (e.g., directors + popularity + user status) like Dataset A",
        "Dataset B questions show repetitive patterns for \"highest rating\" without Dataset A's contextual variations (e.g., combining with release dates/critic metrics)",
        "Dataset B uses simpler time constraints (before/after year) compared to Dataset A's specific date formats (YYYY/MM/DD)",
        "Dataset B never requests both IDs and names simultaneously (e.g., \"director's id\") as seen in Dataset A",
        "Dataset B lacks queries about list contents/metadata (movie counts in lists, list update dates) present in Dataset A",
        "Dataset B questions never combine numeric thresholds with status-based filters (e.g., \"<50 movies + subscriber status\") like Dataset A"
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B does not include queries requesting user visual attributes (e.g., portrait pictures, avatar URLs), while Dataset A does.",
        "Dataset A contains questions involving percentage calculations (e.g., percentage of movies released in a specific year), which are absent in Dataset B.",
        "Dataset B incorporates genre or thematic filters (e.g., horror movies, Christianity-themed lists), while Dataset A does not reference genres or themes.",
        "Dataset A explicitly requests URLs or links to rating resources (e.g., 'links to the ratings'), which Dataset B does not.",
        "Dataset A includes queries requiring director IDs in responses, while Dataset B only asks for director names or movie titles.",
        "Dataset A's questions frequently request multiple distinct data points (e.g., director name, release date, average rating) in a single query, whereas Dataset B's queries focus on retrieving a single type of information (e.g., title) per question.",
        "Dataset A references the number of movies within lists as a filter (e.g., 'lists with at least 200 movies'), a feature absent in Dataset B.",
        "Dataset A includes queries about 'likes' metrics associated with critics, which Dataset B does not mention.",
        "Dataset B asks for positional elements in lists (e.g., 'first movie in the list'), while Dataset A does not reference list positions.",
        "Dataset B uses explicit numerical ranking limits (e.g., 'top 5 movies'), whereas Dataset A does not specify numerical limits in ranking queries."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Queries in B explicitly reference database table names (e.g., 'lists_users table', 'Ratings table') while A does not mention tables directly.",
        "B includes genre-specific filters (e.g., 'Horror genre') whereas A queries never filter by genre.",
        "B focuses on retrieving standalone movie titles based on singular metrics (e.g., 'highest rating') without additional contextual details like directors or release dates, which are common in A.",
        "A queries frequently request user metadata (e.g., avatar URLs, portrait pictures) while B never includes these attributes.",
        "B uses repetitive phrasing for identical intents (e.g., multiple instances of 'highest rating' questions) whereas A demonstrates more varied query structures.",
        "A includes percentage-based calculations (e.g., 'percentage of rated movies released in 2021') while B lacks statistical queries beyond counts/averages.",
        "B explicitly requests top-N rankings (e.g., 'top 3', 'top 5', 'top 10') as part of aggregation logic, which A does not use.",
        "A combines multiple aggregation types in single queries (e.g., 'highest amount of likes per critic' paired with movie counts) while B uses simpler single-metric aggregations.",
        "B includes payment-related filters (e.g., 'users with a payment method') whereas A filters exclusively by subscription status tiers like trialist/subscriber.",
        "A queries incorporate temporal relationships between user actions and subscription status (e.g., 'created when eligible for trial') while B only uses static status filters."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B questions focus on singular attributes (e.g., title, count) without combining multiple distinct attributes in a single query, unlike Dataset A which frequently requests combined outputs (e.g., director name + release date + average rating).",
        "Dataset B uses simpler aggregation phrasing (e.g., 'highest rating score') without nested qualifiers like 'most popular movie of all time' or 'users who were on a trialist' seen in Dataset A.",
        "Dataset B questions lack temporal constraints involving user status changes (e.g., 'when they were a subscriber/trialist') present in 40% of Dataset A questions.",
        "Dataset B queries never request percentage calculations (e.g., 'percentage of rated movies') or ratio-based metrics that appear in Dataset A.",
        "Dataset B questions avoid multi-part interrogatives (e.g., 'What is X? What is Y?') that occur in 20% of Dataset A samples.",
        "Dataset B never references user-created content metadata (e.g., list descriptions, avatar URLs, follower counts) that are common in Dataset A.",
        "Dataset B questions use only basic comparison operators (> , =) without Dataset A's complex eligibility filters (e.g., 'eligible for trial', 'not a trialist').",
        "Dataset B lacks queries involving hierarchical relationships (e.g., 'director of most popular movie', 'critic likes per movie') present in Dataset A.",
        "Dataset B questions never specify output ordering limitations (e.g., 'top three movies', 'first created list') that appear in Dataset A.",
        "Dataset B shows repetitive query patterns (e.g., 6 instances of 'movie with highest rating score') while Dataset A maintains greater question diversity despite overlapping entities."
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset A includes questions requesting listing of specific user reviews or comments (e.g., \"List all negative comments\"), while Dataset B does not.",
        "Dataset B includes queries about app size attributes (e.g., \"size less than 50M\"), which are absent in Dataset A.",
        "Dataset A integrates sentiment analysis metrics (e.g., sentiment polarity, subjectivity) across all samples, while Dataset B does not mention these metrics (except in one outlier).",
        "Dataset B frequently references specific app categories in uppercase (e.g., 'PERSONALIZATION'), whereas Dataset A uses lowercase or general category terms (e.g., \"puzzle games\").",
        "Dataset A includes questions about app update dates (e.g., \"not been updated since 2015\"), which are absent in Dataset B.",
        "Dataset B focuses on aggregating numerical data (e.g., average ratings) within categories, while Dataset A combines numerical thresholds with sentiment metrics (e.g., \"rating 4.5 and above with negative sentiment\").",
        "Dataset A includes queries about app compatibility with specific Android versions (e.g., \"Android ver 8.0\"), which Dataset B does not address.",
        "Dataset B uses exact numerical thresholds formatted with underscores or commas (e.g., 10,000,000+ installs), while Dataset A uses simpler numerical expressions (e.g., \"5000+ installs\").",
        "Dataset A frequently combines multiple non-numerical criteria (e.g., sentiment, reviews, compatibility) in queries, whereas Dataset B focuses on technical attributes (size, installs, categories).",
        "Dataset B includes questions about total size or average size of apps in specific categories (e.g., \"total size of travel and local apps\"), which are absent in Dataset A."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B queries do not mention sentiment analysis metrics (polarity, subjectivity) found in Dataset A.",
        "Dataset B does not request listing individual user reviews or comments, focusing instead on aggregate counts.",
        "Dataset B does not reference app metadata such as version numbers or update dates in its queries.",
        "Dataset B excludes questions about app pricing (free/paid distinctions) present in Dataset A.",
        "Dataset B does not query compatibility requirements (e.g., Android versions) seen in Dataset A.",
        "Dataset B lacks queries about app genres, focusing exclusively on categories unlike Dataset A.",
        "Dataset B uses simpler aggregate thresholds (e.g., '>4.0') without combining multiple complex conditions like Dataset A.",
        "Dataset B does not require percentage calculations involving temporal conditions (e.g., 'since 2018') present in Dataset A.",
        "Dataset B excludes queries about sales performance metrics ('best selling app') found in Dataset A.",
        "Dataset B does not combine sentiment scores with rating metrics in single queries as seen in Dataset A."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B questions more frequently request aggregated results across multiple apps (e.g., 'average rating of puzzle games') rather than individual app details",
        "Dataset B uses explicit threshold qualifiers in install counts (e.g., 'more than 1,000,000 times') more consistently than A",
        "Dataset B includes questions about app size requirements (e.g., 'more than 100 MB in size') not present in A",
        "Dataset B emphasizes popularity metrics in wording (e.g., 'most popular apps') while A uses neutral terms like 'most reviews'",
        "Dataset B combines multiple numeric filters in single questions more frequently (e.g., 'rating 4.5+ AND 1M+ installs')",
        "Dataset B contains more questions about category-level statistics (e.g., 'average rating of Adventure game category') than genre-level analysis",
        "Dataset B uses percentage questions about overall dataset characteristics (e.g., '% of apps with negative sentiment') rather than specific app groups",
        "Dataset B includes explicit ranking qualifiers in questions (e.g., 'by the number of reviews') more systematically than A",
        "Dataset B questions more frequently correlate install counts with ratings (e.g., 'apps with >1M installs AND >4.5 rating')",
        "Dataset B uses hierarchical listing structures (e.g., 'top categories... list apps under each') not found in A's flat listings"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B queries focus on numerical aggregations (e.g., counts, averages) without explicit mention of sentiment analysis components like polarity or subjectivity.",
        "Dataset B does not reference textual review content (e.g., comments, sentiments) directly, whereas Dataset A frequently requests sentiment polarity, subjectivity, or specific review text retrieval.",
        "Dataset B queries are structurally simpler, often requesting single aggregated values (e.g., average rating), while Dataset A combines multiple metrics (e.g., rating + sentiment score) in results.",
        "Dataset B does not include questions about app metadata such as update dates, version numbers, compatibility requirements, or price, which are present in Dataset A.",
        "Dataset B lacks references to percentages or ratios involving sentiment categories (e.g., positive/negative splits) common in Dataset A.",
        "Dataset B does not require grouping or analysis by non-categorical attributes like content rating (e.g., 'Teen') or temporal filters (e.g., 'not updated since 2018') seen in Dataset A.",
        "Dataset B queries prioritize categorical averages (e.g., 'average rating of [category] apps') over individual app attribute retrieval, unlike Dataset A's frequent requests for app names paired with specific metrics.",
        "Dataset B avoids questions involving sentiment subjectivity scores or nuanced sentiment classifications (e.g., 'neutral attitude', 'sentiment subjectivity of no more than 0.5') found in Dataset A.",
        "Dataset B does not include queries about combined conditions spanning multiple domains (e.g., 'free apps with rating >4.5 not updated since 2018') as seen in Dataset A.",
        "Dataset B omits references to install ranges (e.g., '5,000+ installs') and genre-specific thresholds (e.g., 'genre downloads >1B') present in Dataset A."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries focus exclusively on single metrics (e.g., average rating, count of reviews), while Dataset A often combines multiple metrics or criteria (e.g., rating + sentiment polarity) in a single query.",
        "Dataset B does not reference sentiment analysis metrics (e.g., polarity, subjectivity, or sentiment categories), which are central to many Dataset A queries.",
        "Dataset B includes explicit references to the 'playstore table,' a structural detail absent in Dataset A.",
        "Dataset B queries lack filters based on textual content in reviews (e.g., comments containing 'gr8'), which are present in Dataset A.",
        "Dataset B contains repetitive question structures (e.g., multiple variations of 'average rating for [category]'), whereas Dataset A questions are more varied in scope and complexity.",
        "Dataset B does not reference app update timelines (e.g., 'not updated since 2018'), a frequent filter in Dataset A.",
        "Dataset B includes queries about app size (e.g., 'size and minimum number of downloads'), which are absent in Dataset A.",
        "Dataset B does not request percentage or ratio calculations (e.g., 'percentage ratio between positive and negative sentiments'), which are common in Dataset A.",
        "Dataset B does not target age groups or content ratings (e.g., 'Teen'), which are explicit filters in Dataset A.",
        "Dataset B does not use ranking terms like 'top 5' or 'lowest rated,' which are frequent in Dataset A."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset A focuses on sentiment analysis (e.g., polarity, subjectivity) in queries, while Dataset B does not mention sentiment metrics.",
        "Dataset A includes percentage-based calculations (e.g., 'percentage of no comment reviews'), while Dataset B exclusively uses average calculations.",
        "Dataset A contains queries about app version compatibility and update timelines (e.g., 'not been updated since 2015'), while Dataset B does not reference temporal metadata.",
        "Dataset A specifies exact numerical thresholds for sentiment scores (e.g., 'sentiment review greater than 0.5'), while Dataset B focuses only on rating thresholds.",
        "Dataset A explicitly asks for user review content (e.g., 'list all of its reviews'), while Dataset B only references aggregated rating metrics.",
        "Dataset A uses higher install count thresholds (e.g., '5,000+') compared to Dataset B's smaller thresholds (e.g., '500').",
        "Dataset A includes price-related queries (e.g., 'most expensive app'), while Dataset B only distinguishes between free/paid apps without monetary values.",
        "Dataset A requires comparisons between sentiment categories (e.g., 'percentage ratio between positive and negative sentiments'), while Dataset B focuses solely on rating averages.",
        "Dataset A queries often combine 3+ attributes (e.g., category + rating + sentiment + install count), while Dataset B queries typically combine only 2 attributes.",
        "Dataset A explicitly requests ranked lists with numerical limits (e.g., 'top 5'), while Dataset B uses qualitative terms like 'top-rated' without specific quantity constraints."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries focus on aggregate averages (e.g., 'average rating of all apps in X category') while Dataset A includes granular details like individual app ratings, specific reviews, or sentiment metrics (e.g., 'sentiment polarity of Cooking Fever').",
        "Dataset B lacks explicit references to sentiment analysis metrics (e.g., 'sentiment polarity,' 'subjectivity') present in all Dataset A questions (e.g., 'neutral attitude,' 'sentiment subjectivity score').",
        "Dataset B exclusively uses the term 'PlayStore' or 'Play Store' in questions, while Dataset A omits platform-specific naming conventions.",
        "Dataset B questions frequently use the phrase 'average rating' as a standalone metric, whereas Dataset A combines rating thresholds with other conditions (e.g., 'rating 4.5 and above' + 'not updated since 2018').",
        "Dataset B queries focus on broad popularity thresholds (e.g., '10 million installs') compared to Dataset A\u2019s narrower install ranges (e.g., '5,000+ installs').",
        "Dataset B does not include compatibility filters (e.g., Android versions) or app update dates, which are common in Dataset A (e.g., 'not updated since 2015').",
        "Dataset B questions omit direct references to user comments or reviews (e.g., 'list all negative comments'), which are central to Dataset A queries.",
        "Dataset B emphasizes category-wide comparisons (e.g., 'top 5 in Action category') while Dataset A includes multi-genre or cross-category filters (e.g., 'apps with multiple genres').",
        "Dataset B uses exact numerical price thresholds (e.g., 'price less than 5 dollars'), whereas Dataset A refers to binary price classifications (e.g., 'free apps').",
        "Dataset B queries are structurally simpler, often requesting a single metric (e.g., 'average rating'), while Dataset A combines multiple metrics (e.g., 'rating + sentiment subjectivity') in a single question."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B questions are simpler, focusing on single metrics (e.g., counts, averages) without layered conditions seen in A (e.g., 'rating 4.5+ AND not updated since 2018')",
        "Dataset B lacks references to temporal metadata (e.g., update years, Android version compatibility) present in all A questions",
        "Dataset B does not require percentage calculations (e.g., 'percentage ratio between positive/negative sentiments') seen in A",
        "Dataset B never combines sentiment analysis with other attributes (e.g., 'sentiment subjectivity + rating + genre') as A consistently does",
        "Dataset B questions show repetition of identical structures (e.g., multiple 'average rating in X category' queries) absent in A",
        "Dataset B lacks questions about app pricing/commercial aspects (e.g., 'average price for dating apps') present in A",
        "Dataset B never requests version-specific information (e.g., 'current version') that appears in multiple A questions",
        "Dataset B contains table-specific references ('playstore table') not found in A's app-specific queries",
        "Dataset B uses simpler sentiment categorization (positive/negative) without A's granular scores (polarity 0.5+ thresholds, subjectivity metrics)",
        "Dataset B lacks compound genre requirements (e.g., 'multiple genres') that appear in A's filtering logic"
      ]
    }
  },
  "diffs_real_from_synth": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Queries in B explicitly request numerical limits in ranked results (e.g., 'top 5 professors') while A uses general ranking terms without specific quantities",
        "B includes questions combining multiple aggregation operations in a single query (e.g., 'sum of year 1 and year 2 students') whereas A focuses on single aggregations",
        "B contains direct numerical comparisons between specific entities (e.g., 'course no.16 vs course no.18') while A only uses numerical constraints without entity-vs-entity comparisons",
        "B uses average calculations (e.g., 'average number of courses taught by a professor') while A only uses total counts/sums",
        "B explicitly requests partial result sets (e.g., 'list any five of course IDs') whereas A queries always request complete results",
        "B employs complex multi-clause constraints (e.g., 'no more than two high-level or harder undergraduate courses') while A uses simpler single-operator constraints",
        "B combines course level with course type in filters (e.g., 'professional or master/undergraduate courses') while A only filters by course level alone",
        "B explicitly references faculty employment status in query targets (e.g., 'professor who is currently the member of faculty') while A uses employment status only as filters",
        "B requires combined returns of entity attributes (e.g., 'course ID and the level') in single answers while A typically requests single attributes per query",
        "B uses alternative terminology for identifiers (e.g., 'teacher no.79', 'person ID') whereas A consistently uses 'professor ID'/'student ID' terminology"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes aggregate functions beyond simple counts, such as averages and sums, whereas A only uses counts.",
        "Queries in B involve compound logical conditions (e.g., 'basic or medium undergraduate courses') using AND/OR operators, while A uses single criteria.",
        "B explicitly references program phases or years (e.g., 'pre-phase of qualification', '5th year') not present in A.",
        "B requires responses to include multiple attributes (e.g., course ID and level, professor ID and position status), whereas A typically requests single or paired attributes.",
        "Dataset B specifies detailed employment statuses (e.g., 'position status', 'faculty employees') beyond A's basic faculty/non-faculty distinction.",
        "B contains comparative queries (e.g., 'course no.16 or course no.18'), which are absent in A.",
        "Queries in B use explicit limits (e.g., 'list any five'), while A does not restrict response quantities.",
        "B includes course type granularity (e.g., 'professional', 'master/undergraduate'), whereas A uses generic levels like 'basic/medium/high-level'.",
        "Dataset B incorporates negation or existence conditions (e.g., 'non-faculty members not undergoing the phase'), which A lacks.",
        "B references indirect relationships (e.g., 'courses taught by advisors of student 376') requiring multi-step joins, unlike A's direct queries."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B uses the term 'faculty employees' instead of 'faculty member' or 'faculty affiliated position' used in A.",
        "Dataset B explicitly requests comparisons between specific entities (e.g., 'course no.16 or course no.18'), while A does not.",
        "Dataset B includes queries asking for a limited number of results (e.g., 'any five of course IDs'), whereas A does not specify result limits.",
        "Dataset B uses phrases like 'non-faculty members' and 'teacher no.79,' whereas A consistently uses 'professor' and 'professor ID.'",
        "Dataset B combines categorical filters with 'or' in numerical thresholds (e.g., 'basic or medium undergraduate courses'), while A typically uses single categories or 'AND' logic for combined thresholds.",
        "Dataset B explicitly references 'top N' results (e.g., 'top 5 professors'), which A does not.",
        "Dataset B includes direct arithmetic operations in queries (e.g., 'sum of year 1 and year 2 students'), whereas A uses aggregate functions like count or average without arithmetic combinations.",
        "Dataset B uses phrases like 'undergoing the pre-phase of qualification' as standalone conditions, while A integrates similar phases into broader criteria (e.g., 'students in the master/graduate phase').",
        "Dataset B frequently requests both identifiers and categorical attributes in the same output (e.g., 'course ID and the level of the course'), whereas A often asks for one or the other.",
        "Dataset B includes explicit negation of faculty status in filtering (e.g., 'professor who is not a faculty member'), while A focuses on affirmative faculty status checks."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B includes explicit references to faculty employment specifics (e.g., 'position status', 'faculty employees') in queries, while Dataset A refers only to general faculty status.",
        "Dataset B uses varied terminology for professors, such as 'teacher' or 'faculty employee', whereas Dataset A exclusively uses 'professor'.",
        "Dataset B frequently combines faculty status with other attributes (e.g., course level or advising roles) in single queries, while Dataset A isolates these conditions.",
        "Dataset B explicitly requests multi-field outputs (e.g., 'course ID and level'), whereas Dataset A typically asks for single fields or counts.",
        "Dataset B includes negation or exclusion criteria (e.g., 'non-faculty members', 'not undergoing the phase'), absent in Dataset A.",
        "Dataset B specifies exact numerical thresholds in conditions (e.g., 'more than 3 courses', 'no more than two'), while Dataset A uses broader terms like 'more than one'.",
        "Dataset B references specific academic years (e.g., '5th year', 'eighth year') as filters, whereas Dataset A uses phase names without numerical years.",
        "Dataset B requires positional or employment-related professor attributes (e.g., 'position in the university') in results, while Dataset A focuses on years or status.",
        "Dataset B uses compound aggregation queries (e.g., 'sum of year 1 and year 2 students'), whereas Dataset A aggregates singular metrics.",
        "Dataset B includes comparative queries (e.g., 'course no.16 or course no.18'), while Dataset A does not directly compare specific entities."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes aggregate functions beyond counting (e.g., SUM, AVG), while A only uses COUNT operations.",
        "Dataset B explicitly references comparative queries (e.g., 'course no.16 or course no.18'), whereas A does not compare entities directly.",
        "Dataset B contains queries requesting ranked results (e.g., 'top 5 professors'), while A lacks ranking or ordering constraints beyond basic filtering.",
        "Dataset B asks for metrics about course popularity (e.g., 'courses with the most number of professors'), which A does not address.",
        "Dataset B combines aggregation with multi-step conditions (e.g., 'no more than two high-level or harder undergraduate courses'), while A uses simpler aggregation criteria.",
        "Dataset B explicitly references faculty membership (e.g., 'member of faculty') as a standalone filter, whereas A uses position status more generically.",
        "Dataset B includes queries about non-faculty members with specific program phase conditions (e.g., 'not undergoing the phase of qualifications'), while A focuses on existence checks without phase linkages.",
        "Dataset B specifies program phases in greater detail (e.g., 'pre-phase of qualification'), whereas A uses broader terms like 'year of study'.",
        "Dataset B queries about courses taught by multiple professors (e.g., 'taught by more than 4 people'), which A does not mention.",
        "Dataset B uses explicit output limits (e.g., 'list any five'), while A does not restrict result sizes."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes aggregation functions (e.g., COUNT, SUM, AVG) in queries (e.g., 'How many courses', 'average number of courses') while A only retrieves direct lists or single values.",
        "Dataset B requires combining multiple independent criteria in filters (e.g., 'basic or medium undergraduate courses taught by faculty members') whereas A typically uses single filters per query.",
        "Dataset B contains ranking/limiting operations (e.g., 'top 5 professors', 'any five of course IDs') absent in A's queries.",
        "Dataset B explicitly compares numerical quantities between entities (e.g., 'course no.16 or course no.18') while A only requests absolute values.",
        "Dataset B queries frequently combine employment status with other attributes (e.g., 'faculty professor who taught course ID 104') as compound filters, unlike A's simpler status checks.",
        "Dataset B includes mathematical operations on results (e.g., 'sum of year 1 and year 2 students') not present in A's questions.",
        "Dataset B uses varied terminology for roles ('teacher', 'faculty employees', 'non-faculty members') where A consistently uses 'professor' with numerical IDs.",
        "Dataset B requires multi-step joins across 3+ entities with additional constraints (e.g., 'advisors who gave advice to student 376') while A's joins are simpler professor\u2194course mappings.",
        "Dataset B explicitly references student progress phases ('pre-phase of qualification', '5th year') as filters, unlike A's simpler status checks.",
        "Dataset B queries often demand grouped/statistical outputs (e.g., 'course with the most professors', 'average number of courses') whereas A focuses on direct record retrieval."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries more frequently use aggregation functions (e.g., 'how many', 'average', 'sum') as the primary objective, whereas Dataset A typically uses aggregation only to support filtering or ranking.",
        "Dataset B includes explicit requests for positional or hierarchical statuses (e.g., 'position status', 'faculty employees'), while Dataset A focuses on temporal or experiential attributes (e.g., 'years in program').",
        "Dataset B contains queries comparing numerical values between entities (e.g., 'course no.16 or course no.18') or ranking results (e.g., 'top 5 professors'), which are absent in Dataset A.",
        "Dataset B explicitly references student-advisor relationships and student-specific attributes (e.g., 'students who are in the 5th year'), whereas Dataset A focuses solely on professor-course relationships.",
        "Dataset B includes queries about non-faculty members or conditional exclusions (e.g., 'professor who is not a faculty member'), while Dataset A filters only within faculty statuses or program durations.",
        "Dataset B frequently combines course levels with administrative phases (e.g., 'pre-phase of qualification', 'eighth year of program'), whereas Dataset A uses simpler temporal thresholds (e.g., 'at least 3 years').",
        "Dataset B uses multi-part answer formats (e.g., 'State the course ID and the level'), whereas Dataset A requests single attributes per query.",
        "Dataset B includes queries about indirect relationships (e.g., 'courses taught by the advisors who gave advice to student with ID 376'), while Dataset A focuses on direct professor-course mappings.",
        "Dataset B explicitly limits result sets (e.g., 'list any five of course IDs'), whereas Dataset A requests complete lists without truncation.",
        "Dataset B incorporates compound numerical thresholds (e.g., 'no more than two high-level courses'), while Dataset A uses simpler inequalities (e.g., 'more than 5 years')."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B includes queries that explicitly request ranking or top N results (e.g., 'top 5 professors'), while A does not.",
        "Queries in B frequently combine multiple course level categories in a single question (e.g., 'basic or medium undergraduate'), whereas A typically references single categories like 'undergraduate' or 'graduate'.",
        "Dataset B contains explicit comparisons between specific course IDs (e.g., 'course no.16 or course no.18'), while A only compares abstract attributes like difficulty levels.",
        "B includes numerical thresholds in aggregation constraints (e.g., 'more than 3 courses', 'no more than two'), whereas A only uses basic counts without such thresholds.",
        "Queries in B often require listing both course/professor IDs and associated metadata (e.g., 'course ID and level') in results, while A typically requests singular attributes.",
        "Dataset B uses compound faculty/position filters (e.g., 'faculty employees', 'position status in faculty'), whereas A only checks basic faculty membership status.",
        "B includes queries about average metrics (e.g., 'average number of courses taught'), while A focuses exclusively on counts and existence checks.",
        "Dataset B explicitly distinguishes between faculty and non-faculty professors in filtering, while A only references general faculty status without this dichotomy.",
        "Queries in B frequently combine academic phases/years with advisor relationships (e.g., '5th year of their program'), whereas A handles these attributes separately.",
        "B contains explicit requests for partial results (e.g., 'list any five of course IDs'), while A queries always request complete result sets."
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B queries frequently request media URLs/links (user avatar URLs, rating links) not present in A",
        "Dataset B includes explicit requests for descriptive text fields (list descriptions, critic text) while A focuses on structured metadata",
        "Dataset B contains queries about critic-related data (critic counts, critic-associated likes) absent in A",
        "Dataset B requires combining multiple distinct data points (director + release year + conditional rating) in single responses more frequently than A",
        "Dataset B references list update timestamps for recency checks while A only uses creation timestamps",
        "Dataset B uses 'eligible for trial' status as a temporal condition during list creation, whereas A uses static subscription states (trialist/subscriber)",
        "Dataset B explicitly tracks 'likes' as a social metric connected to critics/ratings, which A doesn't reference",
        "Dataset B queries include movie popularity as a specific numeric metric (>1000 popularity), while A uses 'popularity' more generally",
        "Dataset B requires counting entities with attached conditional sub-queries (e.g., 'highest likes per critic') unlike A's simpler counts",
        "Dataset B asks for director identification through both name and ID in different queries, while A only references director names"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries that reference user subscription statuses or eligibility (e.g., 'subscriber', 'trialist', 'eligible for trial') as filtering criteria, absent in A.",
        "Dataset B explicitly requests textual metadata like user-generated list descriptions (e.g., 'description of user 85981819's movie list'), not observed in A's samples.",
        "Dataset B contains queries about user-specific content creation contexts (e.g., 'created when he/she was eligible for trial'), whereas A focuses purely on user IDs without contextual states.",
        "Dataset B requires percentage calculations (e.g., 'percentage of rated movies released in 2021'), while A focuses on absolute counts or averages.",
        "Dataset B includes questions about critic-specific metrics (e.g., 'critics were given to the movie', 'critic made by the user'), which are absent in A.",
        "Dataset B asks for user portrait URLs (e.g., 'user avatar url') alongside content metrics, whereas A only references list/movie URLs.",
        "Dataset B combines multiple aggregation layers in single queries (e.g., 'director of the most popular movie... and indicate average rating score of trialist users'), while A typically isolates one aggregation per question.",
        "Dataset B explicitly requests 'recently' updated/rated timestamps (e.g., 'rated recently', 'updated most recently'), whereas A focuses on creation dates without recency emphasis.",
        "Dataset B includes conditional logic on user actions (e.g., 'rated the movie when they were on a trialist'), while A's conditions are based purely on numerical thresholds.",
        "Dataset B queries involve hierarchical relationships between entities (e.g., 'highest amount of likes that each critic per movie has received'), requiring nested aggregations not seen in A."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Queries in B explicitly request user profile images (e.g., portrait picture, avatar URL), while A does not reference visual user metadata",
        "B includes direct references to critic-specific metrics (e.g., 'critics were given', 'critic made by the user'), whereas A focuses exclusively on general ratings",
        "B requires retrieval of textual list descriptions and rating commentary, while A queries only reference quantitative list attributes",
        "B contains percentage-based calculations (e.g., 'percentage of rated movies') as distinct aggregation types not present in A's samples",
        "B explicitly requests director IDs as separate entities, while A only references director names",
        "B introduces 'likes' as a distinct engagement metric tied to critics/reviews, unlike A's exclusive focus on ratings and followers",
        "B uses absolute date comparisons (e.g., 'after 2010/1/1') for time filters, whereas A employs relative time ranges like 'last month'",
        "B applies numeric constraints on list contents (e.g., 'less than 50 movies in the list'), while A only filters based on follower counts or activity metrics",
        "B frequently combines multiple disparate data types (e.g., director ID + release year + average rating) in single requests, whereas A questions typically focus on singular metrics",
        "B includes references to external rating/critic links as retrievable URLs, a feature absent in A's metadata requirements"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B queries include requests for multimedia attributes (e.g., 'portrait picture', 'user avatar url', 'links to ratings') not present in A.",
        "Dataset B uses percentage calculations (e.g., 'percentage of rated movies') while A focuses exclusively on counts/averages.",
        "Dataset B explicitly references temporal user status conditions during specific actions (e.g., 'when he/she was eligible for trial') unlike A's static role filters.",
        "Dataset B contains nested aggregation requirements (e.g., 'highest amount of likes that each critic per movie has received') absent in A's simpler aggregations.",
        "Dataset B includes exact date comparisons (e.g., 'created after 2010/1/1') while A only uses year-based date filters.",
        "Dataset B queries user-generated list metadata descriptions (e.g., 'description of user's movie list') not referenced in A.",
        "Dataset B requires exact string matches for rating values (e.g., 'rating of \"4\"') whereas A uses numerical threshold operators.",
        "Dataset B combines director identification with popularity metrics and critic engagement in single queries, unlike A's separate handling of these attributes.",
        "Dataset B explicitly requests existence checks (e.g., 'if there's any') for subqueries/correlated data, which A does not require.",
        "Dataset B queries focus on conditional counts based on dual status exclusions (e.g., 'users who were not trialists AND gave \u22642 ratings') while A uses simpler single-condition filters."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Queries in dataset B explicitly request URLs or links (e.g., user avatars, rating links), while A does not reference external resources.",
        "Dataset B includes questions requiring percentage calculations (e.g., 'percentage of rated movies'), whereas A only uses absolute counts or averages.",
        "Questions in B frequently combine multiple output attributes in a single query (e.g., director + release date + conditional average rating), while A focuses on single attributes per query.",
        "Dataset B introduces the concept of 'critics' and critic-related metrics (e.g., critic counts, likes per critic), which are absent in A.",
        "Queries in B explicitly reference textual metadata fields like list/movie 'descriptions,' whereas A focuses on numerical or identifier-based attributes.",
        "Dataset B includes conditional checks for data existence (e.g., 'if there's any') and multi-step logic (e.g., 'indicate X if Y'), which A lacks.",
        "B requires granular temporal constraints on user statuses (e.g., 'when he/she was eligible for trial'), whereas A uses static status filters (e.g., 'subscribers').",
        "Questions in B specify exact ranking positions (e.g., 'top three movies'), while A uses general superlatives like 'most popular' or 'highest.'",
        "Dataset B includes composite filters combining list properties and user statuses (e.g., 'created by subscribers AND with <50 movies'), whereas A uses simpler single-condition filters.",
        "B explicitly requests director IDs and ties directors to metrics like 'most movies directed,' while A does not mention director-specific identifiers or rankings."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries frequently request user media attributes (e.g., portrait pictures, avatar URLs) not referenced in Dataset A",
        "Dataset B includes queries about directors (names/IDs) and their movie associations, while Dataset A lacks director-related attributes",
        "Dataset B contains explicit percentage calculations (e.g., 'percentage of rated movies') not found in Dataset A's aggregate functions",
        "Dataset B requests external resource links (e.g., rating links, avatar URLs) absent in Dataset A queries",
        "Dataset B incorporates critic-related metrics (number of critics, critic likes) not present in Dataset A's rating system",
        "Dataset B uses precise date format filters (e.g., 'after 2010/1/1') whereas Dataset A employs relative time references (e.g., 'last year')",
        "Dataset B combines multiple conditional clauses in single queries (e.g., subscriber status AND movie count thresholds) more frequently than Dataset A",
        "Dataset B explicitly references 'likes' as a popularity metric for critics/movies, unlike Dataset A which only uses followers/ratings",
        "Dataset B queries director IDs and creator IDs as distinct attributes, while Dataset A only references user IDs and list IDs",
        "Dataset B includes compound metric requests (e.g., 'indicate highest likes per critic') whereas Dataset A focuses on singular aggregate values"
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B includes queries about user profile attributes (e.g., portrait picture, avatar URL) not present in A.",
        "Dataset B explicitly references directors (e.g., director names, director IDs) and their relationships to movies, while A does not.",
        "Dataset B requests URLs or links (e.g., user avatar URLs, rating links) as output, which A does not require.",
        "Dataset B contains percentage-based calculations (e.g., 'percentage of rated movies released in 2021'), absent in A.",
        "Dataset B combines multiple distinct metrics in single queries (e.g., director name + release date + average rating), whereas A focuses on single metrics per query.",
        "Dataset B explicitly references 'critics' and critic-related metrics (e.g., 'critics were given to the movie'), which A does not mention.",
        "Dataset B includes the term 'likes' as a popularity metric (e.g., 'number of likes'), whereas A uses 'followers' or generic popularity measures.",
        "Dataset B uses granular temporal conditions for movie releases (e.g., 'released on 1976'), while A uses broader date ranges (e.g., 'after 2010/1/1').",
        "Dataset B requires exact user IDs (e.g., user 85981819) in questions, while A uses placeholders like 'user1' or omits specific IDs.",
        "Dataset B includes multi-part conditional aggregations (e.g., 'highest amount of likes that each critic per movie has received'), whereas A uses simpler aggregations like counts or averages."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B queries frequently request multimedia or URL attributes (e.g., portrait pictures, avatar URLs, list descriptions)",
        "Dataset B contains explicit references to user account statuses (e.g., trialist, subscriber, eligible for trial)",
        "Dataset B questions often require combined entity relationships (e.g., directors linked to movie popularity metrics)",
        "Dataset B includes percentage calculations (e.g., percentage of rated movies from specific year)",
        "Dataset B queries specify temporal account states (e.g., 'when created the list', 'when they rated')",
        "Dataset B contains explicit requests for unique identifiers (IDs) across multiple entity types",
        "Dataset B questions frequently combine multiple distinct data points in single queries (e.g., director name + release date + average rating)",
        "Dataset B includes conditional logic based on user subscription states in filtering criteria",
        "Dataset B references specific content formats (e.g., critic reviews, links to ratings)",
        "Dataset B queries utilize more complex popularity metrics (e.g., 'most movie popularity number', 'highest amount of likes per critic')"
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries about user sentiment metrics (e.g., sentiment polarity, sentiment subjectivity), which are absent in Dataset A.",
        "Dataset B explicitly requests retrieval of user review text (e.g., 'List all of its reviews'), whereas Dataset A focuses only on structured attributes like ratings and installs.",
        "Dataset B contains questions about app update timelines (e.g., 'not been updated since 2018'), a temporal dimension not present in Dataset A.",
        "Dataset B references technical metadata like Android version compatibility (e.g., 'Android ver 8.0 and above'), which Dataset A does not address.",
        "Dataset B asks for app-specific sentiment breakdowns (e.g., 'neutral attitude', 'negative comments'), while Dataset A lacks sentiment granularity in queries.",
        "Dataset B uses the term 'genre' interchangeably with or alongside 'category' (e.g., 'multiple genres'), whereas Dataset A exclusively uses 'category'.",
        "Dataset B includes explicit requests for percentages/ratios involving sentiment metrics (e.g., 'percentage ratio between positive sentiments and negative sentiments'), unlike Dataset A's focus on install/rating percentages.",
        "Dataset B queries often combine sentiment analysis with version/update data (e.g., 'current version'), while Dataset A combines attributes like price and category.",
        "Dataset B requires aggregation of sentiment scores (e.g., 'total sentiment polarity score'), whereas Dataset A aggregates numerical attributes like ratings or install counts.",
        "Dataset B contains questions about app-specific metadata like 'current version' and 'best selling app', which are absent in Dataset A's scope."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B queries specifically analyze sentiment polarity scores (e.g., -1 polarity, >0.5 subjectivity) as distinct metrics, while A only references general sentiment polarity/subjectivity without granular scoring.",
        "Dataset B explicitly calculates percentage ratios (e.g., positive/negative sentiment ratios) and proportional metrics, whereas A focuses on absolute counts or averages.",
        "Dataset B includes queries about app update timelines (e.g., 'not updated since 2018') as a filter condition, which doesn't appear in A's samples.",
        "Dataset B requests direct extraction of user comments (e.g., 'list all negative comments') rather than A's focus on quantitative sentiment counts.",
        "Dataset B incorporates version-specific metadata (e.g., 'current version') in results, while A doesn't reference app versions.",
        "Dataset B queries combine sentiment metrics with install ranges (e.g., '5,000+ installs') and price tiers, creating multi-dimensional filters not seen in A.",
        "Dataset B specifically targets apps with textual patterns in reviews (e.g., 'contain \"gr8\"'), while A lacks free-text analysis in queries.",
        "Dataset B uses sentiment subjectivity scores as quantitative thresholds (e.g., 'subjectivity <=0.5'), whereas A only mentions subjectivity as a general concept.",
        "Dataset B explicitly tracks neutral sentiment counts as a distinct metric, while A focuses on positive/negative polarity.",
        "Dataset B queries frequently require identification of extremes (e.g., 'most expensive app', 'highest -1 polarity') through comparative analysis, unlike A's focus on category averages."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B includes questions targeting specific app names (e.g., \"Garden Coloring Book\", \"Dragon Ball Legends\"), while A does not reference individual apps by name.",
        "Dataset B explicitly requests exact counts of user sentiment categories (e.g., \"how many users dislike\", \"neutral reviews\"), whereas A focuses on aggregate sentiment metrics like averages or comparative percentages.",
        "Dataset B contains questions about content ratings (e.g., \"Teen\" content rating apps), while A does not reference content rating categories.",
        "Dataset B includes queries about text patterns in user reviews (e.g., \"have 'gr8' in their comments\"), while A lacks textual pattern matching requirements.",
        "Dataset B asks for metrics related to multi-genre apps (e.g., \"apps have multiple genres\"), whereas A only references single-genre filtering.",
        "Dataset B requires direct association of sentiment scores with pricing data (e.g., \"average price for a dating application\"), which A does not include.",
        "Dataset B explicitly requests identification of lowest-rated apps (e.g., \"top 5 lowest rated puzzle games\"), while A focuses exclusively on top/highest-rated apps.",
        "Dataset B contains questions about Android version compatibility constraints (e.g., \"compatible with Android ver 8.0 and above\"), a dimension absent in A.",
        "Dataset B includes queries about sentiment subjectivity scores (e.g., \"total Sentiment subjectivity score\"), whereas A only references sentiment polarity metrics.",
        "Dataset B requires percentage calculations for specific sentiment ratios (e.g., \"percentage ratio between positive sentiments and negative sentiments\"), while A uses simpler comparative operators for sentiment analysis."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B includes questions about specific text mentions in reviews (e.g., 'gr8' in comments), while A focuses purely on numerical/text sentiment categorization.",
        "Dataset B explicitly queries sentiment subjectivity scores and polarity values (e.g., 'sentiment subjectivity of 0.5'), whereas A only references basic sentiment categories (positive/neutral/negative).",
        "Dataset B contains questions about app update timelines (e.g., 'not updated since 2018'), a temporal aspect absent in A.",
        "Dataset B requires direct listing of individual user reviews/comments (e.g., 'list all negative comments'), while A only aggregates review sentiment counts.",
        "Dataset B references content rating categories (e.g., 'Teen') that are not mentioned in A's questions.",
        "Dataset B includes questions about technical specifications like Android version compatibility ('compatible with Android ver 8.0'), which A never addresses.",
        "Dataset B explicitly asks for percentage ratios between sentiment types (e.g., 'percentage ratio between positive and negative sentiments'), while A only calculates percentages within single dimensions.",
        "Dataset B contains questions about app pricing details (e.g., 'average price for dating applications'), a financial aspect absent in A.",
        "Dataset B queries metadata about app versions (e.g., 'current version'), which A's questions never reference.",
        "Dataset B specifically requests identification of apps with multi-genre categorization, while A only deals with single-category analysis."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes queries requiring the listing of specific reviews or comments (e.g., 'List all of its reviews'), while A focuses only on aggregated counts or averages.",
        "Dataset B explicitly incorporates sentiment analysis metrics (e.g., 'sentiment polarity', 'sentiment subjectivity') in questions, whereas A limits sentiment categorization to basic labels like 'positive/neutral'.",
        "Dataset B combines multiple distinct attributes in single queries (e.g., 'rating and total sentiment subjectivity score'), while A typically isolates one attribute per question.",
        "Dataset B includes percentage-based calculations tied to conditional thresholds (e.g., 'percentage of no comment reviews'), whereas A uses simpler percentage calculations like average ratings.",
        "Dataset B references content ratings (e.g., 'Teen') as filtering criteria, which are absent in A's questions.",
        "Dataset B requires ranking functions (e.g., 'top 5', 'lowest rated') and explicit ordering, whereas A focuses on singular extremes (e.g., 'highest rating').",
        "Dataset B explicitly queries apps with version-specific compatibility constraints (e.g., 'Android ver 8.0 and above') alongside sentiment data, while A only references version timelines without combining with other metrics.",
        "Dataset B includes queries about apps with multi-genre categorization (e.g., 'apps have multiple genres'), whereas A focuses on single-genre/category grouping.",
        "Dataset B calculates ratios between sentiment categories (e.g., 'percentage ratio between positive sentiments and negative sentiments'), while A counts sentiment categories independently.",
        "Dataset B frequently combines temporal filters (e.g., 'not updated since 2018') with sentiment or percentage criteria, whereas A uses temporal filters in isolation (e.g., 'not updated since 2015')."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes questions about user reviews and sentiments (e.g., sentiment polarity, subjectivity, or comment analysis), while A does not reference reviews or sentiments.",
        "Dataset B explicitly requests percentages (e.g., 'percentage of no comment reviews') in queries, whereas A uses absolute counts (e.g., 'more than 500 installs').",
        "Dataset B references app versions (e.g., 'current version') and compatibility constraints (e.g., 'Android ver 8.0'), while A does not mention versioning.",
        "Dataset B includes queries about app genres (e.g., 'multiple genres') alongside or instead of categories, while A exclusively uses categories.",
        "Dataset B requires sentiment analysis metrics (e.g., 'sentiment polarity score') as output, whereas A focuses on numerical metrics like ratings or install counts.",
        "Dataset B asks for monetization details (e.g., 'average price for a dating application'), while A focuses solely on free/paid status without pricing specifics.",
        "Dataset B includes questions about app update timelines (e.g., 'not been updated since 2018'), while A does not reference update dates.",
        "Dataset B queries often combine sentiment attributes (e.g., 'negative sentiments') with performance metrics, whereas A combines category, rating, and install count filters.",
        "Dataset B explicitly asks for counts or ratios of user attitudes (e.g., 'neutral reviews'), while A focuses on aggregate metrics like average ratings.",
        "Dataset B includes queries about extremes in sentiment (e.g., 'highest sentiment polarity score'), whereas A focuses on extremes in ratings or install counts."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries more frequently request individual app details (e.g., \"What is the rating for 'Garden Coloring Book'?\") rather than aggregated category-wide metrics like Dataset A",
        "Dataset B includes explicit requests for sentiment subjectivity metrics (e.g., \"total Sentiment subjectivity score\") that don't appear in Dataset A",
        "Dataset B contains queries about exact comment content analysis (e.g., \"apps that have 'gr8' in their comments\") while Dataset A focuses on sentiment polarity without text pattern matching",
        "Dataset B specifically references content rating categories (e.g., \"Teen\" content rating) not mentioned in Dataset A queries",
        "Dataset B includes queries about multi-genre apps and their combined sentiment metrics (\"apps have multiple genres and total sentiment subjectivity\"), unlike Dataset A's single-genre focus",
        "Dataset B requires exact numerical counts of specific sentiment types (e.g., \"How many neutral reviews\") while Dataset A uses relative thresholds like \"most positive reviews\"",
        "Dataset B asks for direct percentage ratios between sentiment types (e.g., \"percentage ratio between positive and negative sentiments\") whereas Dataset A focuses on simple percentage calculations",
        "Dataset B queries combine version compatibility checks with sentiment analysis (e.g., \"Android ver 8.0 and above + user sentiment\"), a pairing not seen in Dataset A",
        "Dataset B includes explicit requests to list individual reviews/comments (e.g., \"List all negative comments\") that never appear in Dataset A's aggregation-focused queries",
        "Dataset B contains queries about temporal sentiment comparisons (e.g., \"percentage of apps not updated since 2015 with more negative sentiment\") while Dataset A's temporal filters don't connect to sentiment analysis"
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Queries in dataset B specifically request listing or enumerating individual reviews (e.g., 'List all of its reviews'), while A focuses only on aggregated sentiment metrics like counts or polarity scores.",
        "Dataset B includes explicit percentage-based calculations (e.g., 'percentage of no comment reviews') directly in queries, whereas A primarily uses counts or averages without percentage ratios.",
        "Dataset B references granular metadata like app compatibility with specific Android versions (e.g., 'Android ver 8.0 and above'), which is absent in A's questions.",
        "Queries in B frequently combine multiple distinct metrics in a single question (e.g., 'rating and how many users dislike this App'), while A\u2019s questions focus on single metrics per query.",
        "Dataset B includes explicit filters based on text patterns in reviews (e.g., 'comments containing \"gr8\"'), whereas A does not reference textual content within reviews.",
        "B\u2019s questions explicitly reference app pricing or monetization (e.g., 'average price for a dating application'), while A lacks monetary comparisons.",
        "Dataset B requires correlating update years with sentiment outcomes (e.g., 'not been updated since 2018... percentage having negative sentiment'), whereas A only filters by update year without sentiment linkage.",
        "B\u2019s queries include version-specific details (e.g., 'current version' of an app), which are absent in A\u2019s questions.",
        "Dataset B explicitly asks for genre/category lists based on install thresholds (e.g., 'genre with downloads >1,000,000,000'), while A focuses on category-based aggregations without install conditions.",
        "Queries in B combine sentiment subjectivity scores with other metrics (e.g., 'total sentiment subjectivity of these apps'), whereas A only references polarity or sentiment classifications (positive/negative/neutral)."
      ]
    }
  }
}