{
  "sims": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets query relationships between professors and courses they teach.",
        "Both involve filtering data based on specific course levels (e.g., basic, medium, Level_500).",
        "Both require counting instances (e.g., courses, professors, students) under specific conditions.",
        "Both reference professor IDs and student IDs as key identifiers.",
        "Both include questions about faculty/department affiliations or employment status of professors.",
        "Both involve aggregations (e.g., totals, min/max) for numerical attributes like years in program.",
        "Both use conditional logic (e.g., 'no more than two', 'at least one', 'not') in queries.",
        "Both require joins across entities (e.g., professors linked to students via advising/teaching).",
        "Both ask for lists of IDs paired with categorical attributes (e.g., course levels, position statuses).",
        "Both include queries about the intersection of entities (e.g., courses taught by multiple professors)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets query courses taught by specific professors using professor IDs.",
        "Both include questions about course levels (e.g., 'basic or medium undergraduate' in A, 'Level_500' in B).",
        "Both involve counting entities (e.g., courses, professors, students) with aggregation phrases like 'how many'.",
        "Both reference professor-student advisor relationships (e.g., 'advisor IDs' in A, 'advised by professor' in B).",
        "Both require filtering results by numeric ID ranges or specific IDs (e.g., 'person IDs from 40 to 50' in A, 'professor with ID 234' in B).",
        "Both ask for lists of course IDs linked to professors or students.",
        "Both include queries about extremal values (e.g., 'most courses' in A, 'highest yearsInProgram' in B).",
        "Both use precise database identifiers like 'course ID', 'person ID', and 'student ID'.",
        "Both contain questions about professor employment status (e.g., 'faculty member' in A, 'professors in the program' in B).",
        "Both focus on granular relationships between entities (e.g., courses taught vs. advised students)."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Both datasets query course levels (e.g., basic, medium, high-level, Level 300) as a key attribute.",
        "Both involve filtering professors based on faculty affiliation or position status (e.g., 'faculty member,' 'Faculty_eme').",
        "Both include questions counting courses taught under specific conditions (e.g., 'no more than two,' 'at least two').",
        "Both reference numerical IDs (person, professor, course, student) as primary identifiers for entities.",
        "Both link professors to students via advisor relationships (e.g., 'advisor IDs,' 'advised by').",
        "Both use comparative quantifiers (e.g., 'most,' 'least,' 'highest') to analyze course or teaching metrics.",
        "Both filter queries based on student academic progress (e.g., 'year in program,' 'master/graduate phase').",
        "Both apply numerical ranges or inequalities (e.g., 'ID 40 to 50,' 'greater than 10 and less than or equal to 15').",
        "Both require joining professor, course, and student entities (e.g., 'taught by,' 'advised by').",
        "Both request lists of IDs paired with attributes like course level, position, or student year."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets query the number of courses taught by professors using conditions like course level or faculty status.",
        "Both involve filtering courses by specific levels (e.g., basic, medium, high-level, graduate, or numeric levels).",
        "Both include requests to list course IDs based on criteria like level, professor ID, or faculty affiliation.",
        "Both reference professor/teacher IDs to retrieve course or advising details (e.g., \"teacher no.79\" in A and \"professor with ID 123\" in B).",
        "Both query student program phases (e.g., \"5th year\" in A and \"phase 2\" in B) for aggregation or filtering.",
        "Both ask for counts or lists of students advised by professors or other students (e.g., advisor IDs in A and \"advised by at least one other student\" in B).",
        "Both check professor attributes (e.g., faculty membership in A vs. program tenure in B) to filter results.",
        "Both use aggregation functions (e.g., COUNT, GROUP BY) to quantify courses, professors, or students.",
        "Both request course-level details for specific course IDs (e.g., \"course 165\" in A and \"course ID 10\" in B).",
        "Both reference professional/graduate-level courses (e.g., \"master/graduate\" in A and \"graduate level\" in B)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query course levels (e.g., basic, medium, high-level, professional) associated with specific course IDs or professors.",
        "Both involve filtering or counting courses taught by professors identified via person IDs or names.",
        "Both require joining professor/teacher entities with course entities to retrieve teaching relationships.",
        "Both include queries about advisor-advisee relationships between professors and students, often using professor/student IDs.",
        "Both reference numerical identifiers (e.g., person IDs, course IDs) as primary keys for data retrieval.",
        "Both ask for counts or lists of courses based on criteria like course level, instructor ID, or faculty status.",
        "Both involve queries about faculty membership or position status (e.g., faculty professors, professors with no position).",
        "Both include conditional logic (e.g., 'more than 5 years', 'no more than two courses') in numerical or categorical filters.",
        "Both focus on granular attributes like student years in programs (e.g., 3rd year, 8th year) and their advisor links.",
        "Both require resolving many-to-many relationships (e.g., professors teaching multiple courses, students having multiple advisors)."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets focus on querying relationships between professors and the courses they teach.",
        "Both datasets utilize specific numerical identifiers (e.g., course IDs, professor IDs, student IDs) for entities.",
        "Both include questions about course levels (e.g., 'basic,' 'medium,' 'high-level' in A; 'Level_400,' 'Level_500' in B).",
        "Both involve filtering or counting courses based on categorical criteria (e.g., difficulty level, program year).",
        "Both require joining professor data with course data to answer questions (e.g., identifying who teaches a course).",
        "Both contain queries about faculty membership or employment status of professors (explicitly in A, implicitly in B).",
        "Both reference advisor-advisee relationships between professors and students.",
        "Both include questions about the duration of professors' or students' involvement in programs (e.g., 'years in program').",
        "Both datasets use exact numerical filters (e.g., 'ID 303,' 'course no.9') in queries.",
        "Both involve aggregations (e.g., 'how many,' 'most,' 'least') to quantify results."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Both datasets query courses taught by professors or faculty members.",
        "Both reference course levels (e.g., basic, medium, high-level, masters, professional).",
        "Both include questions about professor identifiers (e.g., person IDs, professor IDs, names).",
        "Both use numerical thresholds (e.g., \u22655 years, \u22642 courses, specific year in program).",
        "Both check faculty/program membership status of professors (e.g., 'member of faculty').",
        "Both ask about the number of courses taught or advised by professors.",
        "Both involve filtering by specific course IDs or professor IDs (e.g., course 165, professor ID 297).",
        "Both include conditional logic based on course attributes (e.g., level, difficulty).",
        "Both use aggregation functions (e.g., count, max) to analyze course/professor data.",
        "Both relate professors to student advising roles (e.g., advisor IDs, years in program)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Both datasets focus on querying course levels (e.g., 'basic', 'medium', 'undergraduate', 'graduate', 'high-level').",
        "Both include questions about professors/teachers and their association with courses (e.g., teaching assignments).",
        "Both require linking entities via IDs (e.g., professor IDs, student IDs, course IDs).",
        "Both involve checking faculty/employment status of professors (e.g., 'faculty member', 'position status').",
        "Both ask for numerical aggregations (e.g., 'how many courses', 'total number of students').",
        "Both include queries about student-advisor relationships and program year status (e.g., '5th year', '8th year').",
        "Both filter results based on categorical attributes (e.g., 'basic or medium', 'high-level or harder', 'Phase 3 students').",
        "Both retrieve specific identifiers (e.g., 'person IDs', 'course IDs', 'advisor IDs') as primary outputs.",
        "Both ask for counts of courses taught by professors under specific conditions (e.g., 'no more than two', 'more than 5 years').",
        "Both include requests to list professors based on teaching criteria (e.g., 'most courses', 'least courses', 'specific courseLevel')."
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets query user-specific data using user IDs and attributes like subscription status or payment method.",
        "Questions in both datasets require aggregations such as counts, averages, percentages, and max/min values.",
        "Filters based on criteria like release year, rating scores, list titles, or timestamps are common in both.",
        "Both reference movie lists with attributes like titles, followers, creation/update dates, and movie counts.",
        "Both inquire about user subscription status (trialist vs. paying) in the context of actions like list creation or ratings.",
        "Directors are frequently queried in both datasets, including counts of their movies and average ratings.",
        "Both require joining entities (e.g., users to ratings, movies to directors, lists to creators) for answers.",
        "Handling ties in rankings (e.g., multiple entries with the same value) by listing all and adding criteria is present in both.",
        "Time-based constraints (e.g., release year, list creation date) are used for filtering in both datasets.",
        "Both ask for ranked or top entries (e.g., highest-rated movies, lists with the most followers, top N results)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets include queries about user-created movie lists, such as list titles, followers, and creation/update timestamps.",
        "Questions in both datasets frequently request counts (e.g., number of movies, users, lists, or followers).",
        "Both datasets ask for movie details like titles, release years, and director names.",
        "Queries in both datasets focus on rating scores, including identifying movies with the highest ratings.",
        "Both datasets involve filtering results based on release years (e.g., movies released in a specific year).",
        "Questions in both datasets reference specific user IDs for actions like list creation or ratings.",
        "Both datasets request URLs for movies, ratings, lists, or director pages.",
        "Queries in both datasets involve popularity metrics (e.g., \"most popular movie\").",
        "Both datasets include timestamps (e.g., creation/update dates for lists or movie release dates).",
        "Questions in both datasets ask about followers (e.g., lists with the most followers or follower counts)."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Both datasets include queries about user-specific movie ratings and interactions using user IDs.",
        "Both datasets involve filtering movies by release year or time periods (e.g., 2021, 2003, 2017).",
        "Queries in both datasets reference movie lists with titles, follower counts, and update timestamps.",
        "Both datasets require aggregations (e.g., counts, averages, percentages) for numerical analysis.",
        "Questions in both datasets focus on subscription status (e.g., paying subscribers, trialists) during user actions.",
        "Both include requests for URLs related to movies, ratings, lists, or director pages.",
        "Queries in both datasets involve movie popularity metrics and rating scores (e.g., scores of 1-5).",
        "Both datasets ask for director-specific information (e.g., directors with the most movies, director names).",
        "Questions in both datasets combine multiple conditions (e.g., release year + director + user status).",
        "Both datasets use explicit identifiers like movie IDs, list titles, and user IDs for granular filtering."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets involve queries about movie ratings, including specific scores or aggregated metrics (e.g., averages, counts).",
        "Queries in both datasets filter results based on movie release years (e.g., 2021, 2000).",
        "Both include questions about movie popularity metrics and rankings (e.g., 'most popular,' 'highest rated').",
        "Directors are referenced as criteria for filtering movies in both datasets (e.g., Steven Spielberg, Christopher Nolan).",
        "User engagement is analyzed through metrics like followers, list creation, or rating activity in both datasets.",
        "Aggregate functions (e.g., COUNT, AVG) are used in queries to analyze data across movies, users, or lists.",
        "Temporal attributes (e.g., creation/update timestamps, rating dates) are used to filter or sort results in both datasets.",
        "Specific numerical thresholds (e.g., rating scores >4, follower counts >100) are applied as filters in queries.",
        "Both datasets include queries about movie lists, including their size, followers, or update activity.",
        "Unique identifiers (e.g., user IDs, list IDs) are used to pinpoint specific entities in queries."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets include queries about movie titles based on specific criteria (e.g., IDs, release years).",
        "Both involve questions about numerical rating scores (e.g., counts, averages, highest values).",
        "Both reference user identifiers (e.g., user_id) to filter or retrieve data.",
        "Both require aggregations (e.g., counts, averages) for metrics like followers or ratings.",
        "Both ask about movie lists, including titles, followers, and creation/update metadata.",
        "Both use movie popularity as a metric in queries.",
        "Both include questions filtering movies by release years.",
        "Both reference numerical identifiers (e.g., movie IDs, list IDs) to target specific entities.",
        "Both involve ranking movies (e.g., \"highest rating,\" \"most popular\").",
        "Both include queries about user-generated content (e.g., ratings, lists, followers)."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Queries involve filtering by user-specific identifiers (e.g., user IDs, payment status).",
        "Both datasets include questions about movie titles, lists, or ratings tied to explicit numerical IDs.",
        "Questions frequently require aggregating or counting results (e.g., 'how many users,' 'most followers').",
        "Queries reference conditional user states (e.g., 'paying subscriber,' 'trialist,' 'subscriber').",
        "Both include requests for temporal data (e.g., release years, rating dates, list update timestamps).",
        "Questions target relationships between movies and user-generated content (e.g., lists, ratings, followers).",
        "Queries often require joins across entities (e.g., linking users, movies, ratings, and lists).",
        "Both datasets ask for URL links to specific entities (e.g., movies, lists, ratings, director pages).",
        "Questions focus on extremal metrics (e.g., 'highest/lowest rating,' 'most followers,' 'most recent update').",
        "Queries include filtering by explicit rating scores (e.g., 'rating of 4,' 'score of 5')."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Both datasets involve queries to retrieve movie titles based on specific criteria like highest rating, popularity, or user subscriptions.",
        "Queries frequently filter results by user subscription status (e.g., trial vs. paying users).",
        "Aggregation functions (e.g., counts, averages, maximum values) are used to derive insights from ratings, popularity, or list metrics.",
        "Questions reference movie lists and their properties, such as the number of movies, followers, or update timestamps.",
        "Numerical thresholds (e.g., lists with >5 movies, users with >100 ratings) are used as conditions in queries.",
        "Temporal filters (e.g., release year, list update dates) are applied to narrow results.",
        "Rankings or top-N results (e.g., top 3 movies, highest-rated films) are explicitly requested.",
        "User-specific data is retrieved using identifiers like user IDs or usernames.",
        "Popularity metrics are central to determining results in both datasets.",
        "Conditions tie user actions (e.g., rating a movie, creating a list) to their subscription status (trial or paid)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Both datasets include queries about specific user IDs and their interactions (e.g., lists created, ratings given).",
        "Both require filtering movies by release year (e.g., 2021, 2020).",
        "Both involve counting or aggregating data (e.g., total movies rated, number of lists, average scores).",
        "Both focus on user-generated movie lists (e.g., list followers, update timestamps, list size).",
        "Both ask for rating scores (e.g., \"rating score of 4\", \"highest rating score\").",
        "Both reference user subscription status (e.g., trialists, paying subscribers, payment eligibility).",
        "Both include questions about popularity metrics (e.g., \"most popular\", \"highest rating\").",
        "Both use comparative thresholds (e.g., \"over 100 movies\", \"greater than 8\").",
        "Both require identifying movies with specific attributes (e.g., director, language, release year).",
        "Both involve retrieving metadata about entities (e.g., movie titles, user-created list descriptions)."
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets require filtering apps based on specific categories or genres (e.g., 'arcade genre' in A, 'PERSONALIZATION' category in B).",
        "All questions involve numerical thresholds (e.g., sentiment > 0.5 in A, installs > 10,000 in B).",
        "Queries in both datasets request aggregation or calculation of metrics (e.g., average rating, count of installs).",
        "App ratings are a critical attribute for analysis in all samples (e.g., 'rating for Learn C++' in A, 'apps with rating \u2265 4.2' in B).",
        "Conditions on app price (free vs. paid) appear in all samples (e.g., 'free sports apps' in A, 'free apps in EDUCATION' in B).",
        "Top-ranked apps are requested using criteria like installs, ratings, or reviews (e.g., 'top 10 most reviewed apps' in A, 'top 5 in SPORTS' in B).",
        "App size is referenced as a filter or metric in all samples (e.g., 'size of Browser 4G' in A, 'average size of FAMILY apps' in B).",
        "All questions require retrieving app names or details (e.g., 'name the top 10 apps' in A, 'names of apps in finance' in B).",
        "Multiple criteria are combined in queries (e.g., 'genre + sentiment + content rating' in A, 'category + installs + rating' in B).",
        "User engagement metrics (installs, reviews) are analyzed across all samples (e.g., 'total installs' in A, 'most reviewed app' in B)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets query numerical metrics related to app evaluations (sentiment scores in A, star ratings in B).",
        "Questions in both datasets filter results by app categories or genres.",
        "Aggregation functions (e.g., average, count) are frequently used to summarize data.",
        "Numerical thresholds (e.g., rating >4.0, sentiment >0.5) are applied as filters in queries.",
        "Top N rankings or comparisons (e.g., top 5 apps, highest-rated) are explicitly requested.",
        "Specific apps are referenced by name in queries across both datasets.",
        "Counts of apps, users, or reviews meeting criteria are calculated in both datasets.",
        "Queries combine multiple attributes (e.g., category + rating, genre + content rating).",
        "App metadata (e.g., installs, reviews, categories) is central to analysis in both datasets.",
        "Conditional logic (e.g., apps not updated since 2015, apps with undefined ratings) is used to refine results."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Both datasets include questions about app genres (e.g., action, puzzle) and categories (e.g., Tools, Education).",
        "Both use numerical thresholds for ratings (e.g., 4.5+ in B, 5.0 in A) to filter results.",
        "Both require aggregation of metrics like average sentiment polarity, average rating, or total installs.",
        "Both reference app install counts with specific thresholds (e.g., 1 million, 10 million) as criteria.",
        "Both combine multiple filters (e.g., genre + rating + sentiment score) in single queries.",
        "Both ask for top-ranked lists (e.g., top 3, top 5) based on installs, ratings, or reviews.",
        "Both include questions about free apps and their attributes (e.g., category, installs).",
        "Both use sentiment analysis metrics (e.g., polarity scores, subjectivity scores) as query parameters.",
        "Both filter apps based on content ratings or target audiences (e.g., Everyone 10+, adults only 18+).",
        "Both involve temporal filters like app update dates (e.g., updated since 2020 or not updated since 2015)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets involve queries about average ratings of apps in specific categories.",
        "Both require filtering data based on numerical thresholds (e.g., ratings >4, reviews >10,000).",
        "Both include questions about the count of apps meeting specific criteria (e.g., high ratings, sentiment conditions).",
        "Both reference app categories or genres as a key filtering parameter (e.g., 'Tools', 'Family', 'Games').",
        "Both use aggregation functions (e.g., AVG, COUNT) to derive insights from the data.",
        "Both ask for top-ranked lists (e.g., 'top 5 most reviewed apps').",
        "Both involve app metadata such as reviews, ratings, and installs as core attributes.",
        "Both include queries with compound conditions (e.g., category + rating thresholds).",
        "Both require comparisons of apps based on quantitative metrics (e.g., installs, reviews).",
        "Both focus on statistical summaries (e.g., average ratings, total counts) rather than raw data extraction."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets ask for average ratings of apps in specific categories (e.g., 'FAMILY', 'BUSINESS').",
        "Queries in both datasets frequently use aggregation functions like 'average' and 'count' to analyze metrics.",
        "Both include questions targeting app-specific metrics (e.g., ratings, reviews) by explicitly naming apps (e.g., 'Cooking Fever', 'HTC Weather').",
        "Questions in both datasets filter results based on app categories (e.g., 'arcade', 'DATING', 'COMMUNICATION').",
        "Both datasets involve counting user reviews (e.g., 'positive reviews', 'neutral attitude', 'number of reviews').",
        "Queries in both datasets seek to identify app popularity through metrics like installs or total reviews.",
        "Both include questions that combine multiple criteria (e.g., category + rating thresholds, free status + genre).",
        "Top-performing apps are a focus in both (e.g., 'highest rating', 'most reviewed', 'best selling').",
        "Both datasets use structured filters (e.g., 'rating > 4.0', 'sentiment review > 0.5') to narrow results.",
        "User feedback analysis (e.g., sentiment polarity in A, positive reviews in B) is central to queries in both datasets."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query average ratings of apps within specific categories (e.g., Family, Lifestyle, Games).",
        "Both involve filtering apps by content ratings (e.g., \"Everyone 10+\", \"Teen\", \"Mature 17+\").",
        "Questions in both datasets use numerical thresholds (e.g., sentiment > 0.5 in A, installs > 500 in B).",
        "Both include requests for aggregations like averages (e.g., average price, average rating).",
        "Both ask for app names matching specific criteria (e.g., apps with a 4.5 rating, apps in a category).",
        "Both reference free vs. paid app distinctions (e.g., \"free puzzle games\" in B, \"average price\" in A).",
        "Both datasets focus on app install counts (e.g., \"top 5 installed free apps\" in A, \"1,000,000 installs\" in B).",
        "Questions in both use category-based grouping (e.g., \"arcade genre\" in A, \"NEWS_AND_MAGAZINES\" in B).",
        "Both include top-N ranking queries (e.g., \"top 10 most reviewed apps\" in A, \"top-rated apps\" in B).",
        "Both combine multiple filters (e.g., genre + sentiment in A, category + installs in B)."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Both datasets query average ratings across categories (e.g., A: 'average rating of comic category apps', B: 'average rating of Entertainment category').",
        "Both focus on filtering apps by genre or category (e.g., A: 'arcade genre', B: 'Action category').",
        "Both include numerical thresholds for ratings (e.g., A: 'apps with 5 rating', B: 'rating above 4.2').",
        "Both analyze app popularity via install counts (e.g., A: 'top 5 installed free apps', B: 'apps with >10,000 installs').",
        "Both use explicit content ratings as filters (e.g., A: 'Everyone 10+', B: suitability for teenagers).",
        "Both request ranked lists (e.g., A: 'top 10 most reviewed apps', B: 'top 5 most popular apps').",
        "Both reference specific app names for granular analysis (e.g., A: 'Garden Coloring Book', B: 'Twitter').",
        "Both calculate averages for metrics like price and sentiment (e.g., A: 'average price of dating apps', B: 'average rating of apps under $5').",
        "Both segment queries by user demographics (e.g., A: 'age group targeted', B: 'apps suitable for teenagers').",
        "Both filter by review volume (e.g., A: 'apps reviewed >75M times', B: 'apps with most reviews')."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Both datasets query app ratings (e.g., average rating, highest rating, ratings >4.0).",
        "Both filter results by app categories (e.g., 'Games', 'Tools', 'Business').",
        "Both use aggregation functions like COUNT (e.g., 'how many reviews') and AVG (e.g., 'average rating').",
        "Both include direct references to specific apps by name (e.g., 'Instagram', 'Facebook', 'Garden Coloring Book').",
        "Both focus on sentiment analysis in reviews (e.g., 'positive sentiment', 'negative sentiment').",
        "Both use numerical thresholds for filtering (e.g., 'rating >4.0', 'review count >75,000,000').",
        "Both ask for rankings (e.g., 'top 5 apps', 'highest rating').",
        "Both reference content ratings or maturity levels (e.g., 'Everyone 10+', 'Mature 17+').",
        "Both query metadata like install counts, review counts, and update dates.",
        "Both use conditional logic (e.g., 'apps that have not been updated since 2015', 'rating greater than 4')."
      ]
    }
  },
  "diffs_synth_from_real": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries about professors' years in the program as a numerical filter (e.g., 'more than 2 years'), while A does not reference professor experience duration as a direct condition.",
        "Dataset B explicitly asks for student phases (e.g., 'Phase Pre_Quals', 'Phase 0'), whereas A focuses solely on years in program without phase distinctions.",
        "Dataset B references specific faculty/department names (e.g., 'Faculty of Mathematics'), while A uses generic terms like 'faculty' without naming departments.",
        "Dataset B contains queries about program completion status (e.g., 'students who have not completed their program yet'), which A does not address.",
        "Dataset B uses explicit numerical range comparisons for years (e.g., 'minimum and maximum number of years in program'), while A uses ordinal terms like 'eighth year' or '5th year'.",
        "Dataset B includes requests for course names (e.g., 'names of the courses taught'), whereas A exclusively references course IDs and levels.",
        "Dataset B asks for combined entity totals (e.g., 'total number of professors and students'), while A focuses on counts of single entity types.",
        "Dataset B queries about courses not taught by specific professors (e.g., 'courses not taught by professor 240'), whereas A uses negation only in broader categorical filters.",
        "Dataset B includes phase-year combinations (e.g., 'students in Phase 0 are in their first year'), while A treats years and statuses as separate attributes.",
        "Dataset B aggregates student counts per course level (e.g., 'total number of students in each level'), whereas A aggregates counts without explicit per-level grouping."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B uses explicit course level labels (e.g., 'Level_500'), while A uses descriptive classifications like 'basic or medium undergraduate'.",
        "Dataset B includes queries about student/professor names (e.g., 'name of the person'), whereas A only references numeric IDs.",
        "Dataset B explicitly asks about individuals with dual roles (e.g., 'both a student and a professor'), which A does not address.",
        "Dataset B queries singular professor IDs (e.g., 'professor 335'), while A frequently uses ID ranges (e.g., 'person IDs from 40 to 50').",
        "Dataset B omits references to faculty employment status terms like 'faculty member' or 'position status' present in A.",
        "Dataset B includes standalone aggregate counts (e.g., 'total number of students in the program'), while A ties counts to specific conditions (e.g., 'taught by a faculty member').",
        "Dataset B uses the attribute 'yearsInProgram' for student timelines, whereas A refers to ordinal years (e.g., '5th year of their program').",
        "Dataset B lacks compound filtering (e.g., combining course level and faculty status) seen in A's queries.",
        "Dataset B repeats structurally identical questions (e.g., 'What courses are taught by professors?'), while A's queries are more syntactically varied.",
        "Dataset B includes extremal queries about explicit database-wide attributes (e.g., 'highest course level in the database'), while A focuses on extremal counts tied to professors or courses."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B includes queries about professors' years of experience (e.g., '5 or more years'), while A does not reference experience duration.",
        "Dataset B explicitly references academic phases (e.g., 'Phase 1', 'master/graduate phase') as standalone filters, whereas A primarily uses numerical years in program.",
        "Dataset B uses negation in conditions (e.g., 'students [...] have not been advised by any professors'), which is absent in A's samples.",
        "Dataset B requires counting professors per course (e.g., 'taught by at least two professors'), while A counts courses per professor.",
        "Dataset B includes course names (e.g., 'Data Structures and Algorithms') as attributes, unlike A's exclusive focus on IDs and levels.",
        "Dataset B contains aggregate calculations like averages (e.g., 'average number of students per professor'), which A's samples lack.",
        "Dataset B references specific teaching phases unrelated to student progress (e.g., 'professors not in their program's first phase'), unlike A's student-year-based filters.",
        "Dataset B queries multi-advisor scenarios (e.g., 'students have more than one advisor') as standalone conditions, while A only ties advisor counts to student academic years.",
        "Dataset B filters professors by positional qualifiers like 'full-time' and 'position other than Faculty_eme', whereas A focuses only on faculty membership status.",
        "Dataset B explicitly uses hyphenated level names (e.g., 'Level_400'), while A uses descriptive terms like 'high-level' without numeric labels."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B uses numeric course levels (e.g., 500) or named tiers (e.g., 'Level_100') instead of qualitative terms like 'basic/medium/high-level' used in A",
        "Dataset B includes queries about program-wide totals (e.g., 'total number of students') without faculty affiliation filters that are consistently present in A",
        "Dataset B contains explicit references to intermediate/graduate levels as distinct categories rather than A's undergraduate/professional/master groupings",
        "Dataset B queries course popularity through enrollment numbers ('students enrolled in each course') while A focuses purely on teaching assignments",
        "Dataset B uses simple numeric thresholds for program tenure ('>5 years') rather than A's positional faculty status checks",
        "Dataset B includes duplicate/repeated questions about program enrollment counts that don't appear in A",
        "Dataset B lacks A's complex conditional thresholds (e.g., 'no more than two') in course count comparisons",
        "Dataset B omits A's multi-condition joins between professor attributes and student advising relationships",
        "Dataset B doesn't request extreme values (most/least courses taught) that appear frequently in A's aggregation queries",
        "Dataset B uses simpler filtering patterns (single WHERE clauses) compared to A's nested conditions combining course levels, faculty status, and ID ranges"
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries explicitly reference professor/student names (e.g., 'Professor Jane') while A only uses numerical identifiers",
        "B includes direct requests for professor/student names ('What is the name of...') whereas A never asks for textual names",
        "B contains queries about hybrid professor-student roles ('professor who is also a student') not present in A's samples",
        "B's questions focus on singular record retrieval ('course with course_id 27') while A emphasizes aggregated counts and comparative queries ('professor taught the least amount')",
        "B uses explicit 'course_id'/'p_id' field labels in queries while A uses generic terms like 'course no.' or 'teacher no.'",
        "B asks about student/professor dual-status verification ('person who is student and has no position') while A maintains clear role separation",
        "B's temporal filters use simple duration comparisons ('more than 5 years') without A's complex positional logic ('8th year with position status')",
        "B lacks A's pattern of combining faculty status with course level filters in single queries",
        "B shows consistent use of 'person with id' phrasing suggesting a unified person table, unlike A's distinct professor/student entities",
        "B omits A's frequent range-based queries (e.g., 'person IDs from 40 to 50') and ranking requests ('teaching the most courses')"
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B exclusively uses standardized course level labels (e.g., 'Level_400', 'Level_500') while A uses relative descriptors ('basic', 'medium', 'high-level')",
        "Queries in B frequently filter by professors' duration in program using comparison operators (e.g., '>5 years') while A uses absolute year values without comparisons",
        "B contains explicit negation patterns (e.g., 'students have not been advised') not found in A's samples",
        "B uses multiple ID field names (p_id, professor_id, person ID) inconsistently while A maintains consistent 'ID' terminology",
        "B lacks explicit references to faculty membership status filters present in most A queries",
        "B includes direct superlative queries about course levels (e.g., 'highest level') while A's superlatives focus on quantity of courses taught",
        "B queries show frequent use of quotation marks around identifiers (e.g., 'Level_400', '27') unlike A",
        "B lacks numerical range filters (e.g., 'IDs from 40 to 50') present in A's queries",
        "B contains no examples combining multiple categorical filters (e.g., 'basic AND faculty-taught') common in A",
        "B queries focus more on direct entity relationships while A emphasizes quantitative comparisons ('no more than two', 'most/least')"
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries often specify professors' years in the program as a threshold (e.g., '\u22655 years'), whereas Dataset A focuses more on students' years in program thresholds.",
        "Dataset B includes direct references to professor names (e.g., 'professor John'), while Dataset A exclusively uses numeric identifiers (e.g., 'teacher no.79').",
        "Dataset B questions frequently combine course attributes with advising roles (e.g., 'advises students in a course with a level of advanced'), whereas Dataset A treats these as separate filters.",
        "Dataset B explicitly references 'students in their 2nd year of the program' as instructors, while Dataset A only associates students with advising roles, not teaching roles.",
        "Dataset B uses parameterized phrasing like 'a certain courseLevel' or 'a person of a certain level' in questions, which Dataset A avoids in favor of explicit values.",
        "Dataset B repeats nearly identical questions about professors with '>5 years in the program' across multiple samples, while Dataset A maintains more query diversity.",
        "Dataset B includes direct requests for professor names (e.g., 'name of the professor'), whereas Dataset A only requests identifiers and position statuses.",
        "Dataset B uses 'p_id' notation for professor identifiers, while Dataset A uses varied formats like 'professor ID', 'teacher no.', and 'person IDs'.",
        "Dataset B contains questions about professors teaching 'more than one course' without aggregation terms like 'most/least', which Dataset A explicitly uses with comparative operators.",
        "Dataset B includes circular logic in some queries (e.g., 'courses taught by professors who have taught more than one course'), while Dataset A maintains straightforward conditional relationships."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B queries focus on single-attribute filtering (e.g., 'Phase 3 students') rather than combined attribute filters like 'basic or medium undergraduate' in A",
        "Dataset B contains explicit requests for SQL syntax specification (e.g., 'Write a SQLite query') while A uses natural language formulations",
        "Dataset B includes queries about database structure itself (e.g., 'course levels available in the database') that A doesn't explicitly ask",
        "Dataset B shows repetitive phrasing of similar questions about professor-course relationships without varied conditions seen in A",
        "Dataset B queries use simpler table/column references (e.g., 'inPhase' column) rather than A's conceptual attributes like 'faculty member'",
        "Dataset B lacks queries combining student attributes with advisor relationships that are prominent in A's samples",
        "Dataset B includes direct ID matching queries (e.g., 'professor with id 5') without requiring positional ranges (40-50) like A",
        "Dataset B omits temporal conditions related to employment duration ('more than 5 years') that A frequently uses",
        "Dataset B contains simpler counting requests without comparative operators ('no more than two') seen in A",
        "Dataset B lacks queries about extremal values ('most courses', 'least courses') that are characteristic of A's questions"
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B focuses more on list attributes (e.g., list IDs, follower counts, titles with specific patterns like containing 's') while Dataset A prioritizes URLs/images (e.g., cover images, rating URLs, movie URLs).",
        "Dataset B explicitly includes exact timestamp constraints (e.g., 'before 2012-11-13 00:00:00 UTC'), whereas Dataset A uses broader time ranges (e.g., years or date intervals).",
        "Dataset B frequently uses 'average' aggregations for ratings (e.g., average rating per director/list/user group), while Dataset A focuses more on counts/percentages of rated movies or user actions.",
        "Dataset B contains questions about list titles with numeric patterns (e.g., '2010', '2021') as standalone identifiers, which is absent in Dataset A.",
        "Dataset B explicitly references payment methods (e.g., 'users who have a payment method') as a filter, while Dataset A focuses solely on subscription status (trialist/paying).",
        "Dataset B includes questions about list/movie popularity within specific user-created lists (e.g., 'WW2 list'), whereas Dataset A queries popularity across all users or directors.",
        "Dataset B uses numerical qualifiers like 'top 3' in rankings more frequently, while Dataset A generally asks for single top entries or ties.",
        "Dataset B explicitly mentions combined user statuses (e.g., 'both trialist and subscriber'), while Dataset A treats trialist/paying as mutually exclusive states.",
        "Dataset B includes percentage calculations tied to payment methods (e.g., '% of users with payment methods'), unlike Dataset A’s percentages based on release years or subscription status.",
        "Dataset B references the 'lists' table directly in questions (e.g., 'from 'lists' table'), while Dataset A implicitly joins entities without naming tables."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Queries in dataset B do not reference user subscription statuses (e.g., 'trialist' or 'paying subscriber') unlike dataset A.",
        "Dataset B questions lack percentage-based calculations (e.g., 'percentage of rated movies') present in dataset A.",
        "Dataset B does not include requests for cover images or user profile images, unlike dataset A.",
        "Queries in dataset B avoid multi-step conditional logic (e.g., 'users eligible for trial when rating') seen in dataset A.",
        "Dataset B questions do not ask for comparisons between entities (e.g., 'how many more lists') found in dataset A.",
        "Dataset B lacks references to user interactions like 'comments' or 'likes on ratings' present in dataset A.",
        "Queries in dataset B do not combine temporal filters with user statuses (e.g., 'created when a paying subscriber') as in dataset A.",
        "Dataset B does not require filtering results by follower ranges (e.g., 'followers between 1-2') unlike dataset A.",
        "Dataset B questions omit descriptions of movie lists (e.g., 'description of the list') included in dataset A.",
        "Queries in dataset B focus on aggregate totals (e.g., 'total number of followers') without layered constraints common in dataset A."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B queries focus on ranking and top-N results (e.g., 'top 3', 'highest number') while A focuses on specific counts/percentages without rankings",
        "Dataset B explicitly requests average ratings (e.g., 'average rating score') while A focuses on raw counts of specific scores",
        "Dataset B contains queries about text pattern matching in list titles (e.g., 'contains the word 'Avengers'') not seen in A",
        "Dataset B references specific movie/list quality thresholds (>8 ratings) while A uses 1-5 rating scale exclusively",
        "Dataset B asks about list metadata (list_id) directly while A focuses on list creator/user context",
        "Dataset B includes queries about payment method status ('users who have a payment method') while A only references subscription/trial status",
        "Dataset B contains more complex inter-list metrics ('average number of movies in lists that also have...') while A focuses on single-list statistics",
        "Dataset B uses explicit movie IDs (e.g., 'movie with ID 1000') for identification where A primarily uses titles",
        "Dataset B queries aggregate across multiple lists/users ('movies added to lists of subscribers') while A typically focuses on individual user/list contexts",
        "Dataset B includes queries about temporal recency ('updated in the last month') with relative timeframes where A uses absolute timestamps"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B queries avoid referencing specific user/list IDs (e.g., 'user 39115684') present in Dataset A, using general terms like 'specific user' instead.",
        "Dataset B includes questions about movie genres (e.g., 'How many different genres of movies are there?'), which are absent in Dataset A.",
        "Dataset B omits references to URLs, cover images, or external links (e.g., 'What's the cover image...?') frequently requested in Dataset A.",
        "Dataset B excludes user subscription status filters (e.g., 'paying subscriber,' 'trialist') commonly used in Dataset A queries.",
        "Dataset B uses explicit 'top N' ranking formats (e.g., 'top 5 most popular movies'), whereas Dataset A uses open-ended superlatives (e.g., 'most popular').",
        "Dataset B queries focus on singular metrics (e.g., 'average rating score') without combining multiple criteria, unlike Dataset A's multi-condition questions (e.g., director + year + popularity).",
        "Dataset B emphasizes database-wide aggregates (e.g., 'How many movies are there in the database?'), while Dataset A focuses on user-specific interactions (e.g., 'How many users gave... a rating score of 4?').",
        "Dataset B employs uniform numerical thresholds (e.g., 'rating score >8') without contextual qualifiers like follower counts or user statuses seen in Dataset A.",
        "Dataset B omits temporal conditions tied to user activity (e.g., 'created when he was a paying subscriber') prevalent in Dataset A.",
        "Dataset B lacks percentage-based calculations (e.g., 'percentage of rated movies released in 2021') present in Dataset A."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Queries in dataset B focus on retrieving single attributes (e.g., title, count) without requiring multiple attributes per result, unlike A which often lists multiple fields (e.g., title, date, score).",
        "Dataset B does not include queries about URLs, cover images, or descriptions of entities (e.g., movies, lists, users), whereas A frequently requests these.",
        "Dataset B lacks questions involving user subscription status (e.g., trialist, paying subscriber) during actions like list creation or rating, which are common in A.",
        "Queries in B do not require percentage calculations (e.g., \"percentage of rated movies\"), unlike A which includes such aggregations.",
        "Dataset B omits temporal filters with date ranges (e.g., \"between 1/1/2017 to 12/31/2017\"), using only simple date comparisons (e.g., \"before 2014\").",
        "Dataset A includes multi-step analytical queries (e.g., combining director, release year, and popularity), while B relies on single-criterion filters (e.g., highest rating).",
        "B does not reference user-generated content interactions beyond ratings (e.g., comments, likes), whereas A asks about comments, likes, and critic-related data.",
        "Queries in B are repetitive and formulaic (e.g., multiple variations of \"highest rating score\"), while A exhibits greater diversity in phrasing and intent.",
        "Dataset B does not involve subqueries or joins (e.g., finding directors with the most movies and their ratings), which are present in A.",
        "B avoids questions requiring conditional aggregations (e.g., \"how many users who were eligible for trial when they rated\"), unlike A."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries frequently request movie/list titles by direct ID lookup (e.g., 'movie with id 152761') while A uses contextual references",
        "Dataset B contains explicit requests for average calculations (e.g., 'average rating score') absent in A",
        "Dataset A queries require percentage calculations (e.g., 'percentage of rated movies') not seen in B",
        "Dataset B includes genre/thematic filters (e.g., 'horror movies', 'Christianity theme') while A does not",
        "Dataset A explicitly requests image URLs (cover images, user images) while B never does",
        "Dataset B uses relative time filters ('last year') while A specifies exact date ranges",
        "Dataset A contains multi-part questions requiring combined answers (e.g., 'when... and what is') that B avoids",
        "Dataset B references specific database tables ('ratings table', 'lists table') in questions while A does not",
        "Dataset A queries filter by comment metrics (likes, critic comments) that B never mentions",
        "Dataset B uses 'top N' ordinal patterns (e.g., 'top 5 movies') while A focuses on singular extremal metrics"
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries explicitly reference database table names (e.g., 'lists_users table', 'Ratings table') while A uses abstract entity references",
        "B's questions frequently repeat identical query patterns (e.g., multiple instances of 'movie with the highest rating') while A maintains more diverse phrasing",
        "B contains direct requests for movie titles without additional metadata (e.g., URLs, images, descriptions) that A frequently requires",
        "B uses simpler aggregation conditions (e.g., 'most followers') while A employs complex comparative aggregations (e.g., 'most number of movies')",
        "B's temporal filters use simple date comparisons (> 2021-01-01) while A uses specific date ranges (1/1/2017 to 12/31/2017)",
        "B lacks queries about user rating history/context (e.g., 'when he/she was a paying subscriber') that A consistently includes",
        "B's questions never request percentage calculations or ratio-based metrics that appear in A's samples",
        "B includes genre-specific filters ('Horror' genre average) while A's queries never mention genre criteria",
        "B's list-related queries focus exclusively on list properties while A connects lists to creator subscription status history",
        "B uses simpler numerical thresholds (>=5 followers) while A employs complex range-based thresholds (followers between 1-2)"
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B focuses on aggregate metrics (e.g., 'average rating score', 'total number of movies rated') without requiring contextual user action timestamps (e.g., subscription status during specific actions).",
        "Dataset B lacks queries about URLs (e.g., cover images, rating URLs, list URLs) present in all Dataset A questions.",
        "Dataset B does not include percentage calculations (e.g., 'percentage of rated movies') seen in Dataset A.",
        "Dataset B omits temporal ranges (e.g., 'between 1/1/2017 to 12/31/2017') and uses only year-based filters.",
        "Dataset B avoids multi-part questions (e.g., combining release dates with user IDs) seen in Dataset A.",
        "Dataset B excludes platform-specific metadata (e.g., 'image URL to the movie on Mubi') required in Dataset A.",
        "Dataset B does not reference user-generated interactions beyond ratings/lists (e.g., comments, likes, list followers).",
        "Dataset B lacks conditional subscription-status checks tied to specific actions (e.g., 'when he created the list').",
        "Dataset B omits questions requiring uniqueness identification (e.g., 'most recently updated list') in favor of thresholds.",
        "Dataset B avoids critic-related data (e.g., 'links to ratings with a critic') present in Dataset A."
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B does not reference sentiment analysis attributes (e.g., sentiment polarity, subjectivity, or user attitudes like 'positive' or 'negative') present in all Dataset A queries.",
        "Dataset B does not include content ratings tied to specific age groups (e.g., 'Everyone 10+' or 'Mature 17+') as seen in Dataset A.",
        "Dataset B does not mention translated reviews or require retrieval of user comments (e.g., 'translated review if available'), unlike Dataset A.",
        "Dataset B does not query app update timelines (e.g., 'not been updated since 2015'), a recurring condition in Dataset A.",
        "Dataset B uses explicit numerical thresholds (e.g., '10,000 installs') without qualitative modifiers (e.g., 'pretty much'), which appear in Dataset A.",
        "Dataset B groups apps by standardized category labels (e.g., 'PERSONALIZATION'), while Dataset A uses genre-based classifications (e.g., 'arcade genre').",
        "Dataset B calculates percentages based on install/rating criteria (e.g., 'percentage of games with 100,000+ installs'), whereas Dataset A focuses on sentiment percentages (e.g., 'percentage of negative sentiments').",
        "Dataset B aggregates metrics at the category level (e.g., 'average size of FAMILY apps'), while Dataset A aggregates at the app or genre level (e.g., 'average sentiment polarity of Cooking Fever').",
        "Dataset B does not reference user sentiment attitudes (e.g., 'neutral attitude' or 'positive favorability') present in all Dataset A samples.",
        "Dataset B specifies app size thresholds with explicit units (e.g., 'size < 50M'), while Dataset A mentions size without units (e.g., 'size of Browser 4G')."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B exclusively references star ratings (e.g., >4.0, 5.0) as numerical metrics, while Dataset A uses sentiment scores (e.g., polarity >0.5) and subjective user attitudes (neutral, positive).",
        "Dataset B includes queries about apps with 'None' ratings (e.g., \"rating of 'None'\"), whereas Dataset A filters apps with undefined ratings via sentiment conditions (e.g., \"apps not updated since 2015\").",
        "Dataset A explicitly requests translated reviews and user comments (e.g., \"translated review if available\"), while Dataset B does not reference translated content.",
        "Dataset B uses uppercase, standardized category names (e.g., 'GAME', 'SOCIAL'), while Dataset A employs lowercase genres (e.g., \"arcade genre\") and age-based content ratings (e.g., \"Mature 17+\").",
        "Dataset A incorporates user demographic filters (e.g., \"age group targeted\"), whereas Dataset B queries lack demographic criteria.",
        "Dataset B explicitly references the Google Play Store as the data source (e.g., \"apps on the Google Play Store\"), while Dataset A does not specify a platform.",
        "Dataset A combines price, app size, and install counts with sentiment or ratings (e.g., \"average price of games\"), whereas Dataset B focuses solely on ratings and review counts.",
        "Dataset A calculates sentiment percentages (e.g., \"percentage of positive sentiments\"), while Dataset B computes total review counts or averages (e.g., \"average review count\").",
        "Dataset B frequently pairs category filters with review count thresholds (e.g., \">10,000 reviews\"), while Dataset A combines categories with sentiment scores and content ratings (e.g., \"arcade genre + Everyone 10+\").",
        "Dataset A queries often request multiple distinct metrics per question (e.g., \"average rating + user count\"), while Dataset B focuses on single metrics or paired aggregates (e.g., \"average rating + max reviews\")."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B does not include questions about translated user reviews, whereas Dataset A frequently requests translated reviews.",
        "Dataset B does not reference specific app names in quotes within its questions, unlike Dataset A.",
        "Dataset B uses install count thresholds (e.g., 1,000 installs) as standalone criteria, while Dataset A combines install counts with sentiment percentages.",
        "Dataset B focuses on broader app categories without specifying niche content ratings (e.g., Mature 17+), whereas Dataset A includes detailed content rating filters.",
        "Dataset B does not ask for the worst-rated apps, while Dataset A includes queries about lowest-rated applications.",
        "Dataset B lacks questions about app size or pricing metrics, which are present in Dataset A.",
        "Dataset B emphasizes popularity metrics (e.g., \"most popular,\" \"top by installs\") more prominently than Dataset A, which uses terms like \"most reviewed\" or \"best selling.\"",
        "Dataset B does not request percentages of sentiment polarity (e.g., percentage of positive sentiments), whereas Dataset A frequently asks for such calculations.",
        "Dataset B's temporal filters focus on apps updated since a specific year, while Dataset A also asks about apps not updated since a year and links this to sentiment analysis.",
        "Dataset B's questions often structure results by grouping or ranking within categories, whereas Dataset A combines multiple metrics (e.g., installs + sentiment) in single queries without explicit grouping."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B does not include questions related to sentiment polarity, subjectivity, or user attitudes (e.g., 'positive sentiment', 'negative sentiment') present in all Dataset A queries.",
        "Dataset B does not reference specific app names (e.g., 'Cooking Fever', 'Brit + Co') in any queries, while Dataset A frequently does.",
        "Dataset B excludes questions about install counts/amounts that are common in Dataset A queries (e.g., 'total installs', 'how many installs').",
        "Dataset B never mentions translated reviews or comment analysis present in multiple Dataset A samples.",
        "Dataset B queries lack references to content rating categories (e.g., 'Everyone 10+', 'Mature 17+') that appear in Dataset A.",
        "Dataset B does not combine sentiment conditions with other filters (e.g., 'rating >4 AND sentiment >0.5') like Dataset A does.",
        "Dataset B excludes percentage-based calculations (e.g., 'percentage of negative sentiment') found in Dataset A queries.",
        "Dataset B does not include temporal filters (e.g., 'not updated since 2015') present in Dataset A questions.",
        "Dataset B never requests both quantitative metrics and textual data (e.g., 'average rating + reviews') in the same question, unlike Dataset A.",
        "Dataset B contains explicit SQL instruction prompts (e.g., 'Write a SQL query') absent from Dataset A."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries do not mention sentiment polarity scores (e.g., 'sentiment polarity > 0.5') present in all Dataset A samples",
        "Dataset B questions never reference age groups or content rating tiers (e.g., 'Everyone 10+', 'Mature 17+') that appear in Dataset A",
        "Dataset B lacks any mention of translated reviews or multilingual content analysis present in Dataset A queries",
        "Dataset B queries never combine rating analysis with temporal filters (e.g., 'not updated since 2015') like Dataset A",
        "Dataset B contains no questions about app prices or free/paid status filters that appear in Dataset A samples",
        "Dataset B never requests percentage calculations (e.g., 'percentage of positive sentiments') featured in Dataset A queries",
        "Dataset B shows no interest in app update dates or version history present in Dataset A's temporal filters",
        "Dataset B queries never ask for combined metric comparisons (e.g., 'rating + installs + sentiment') seen in Dataset A's multi-part questions",
        "Dataset B lacks any reference to sentiment subjectivity scores that appear in Dataset A's Photography app query",
        "Dataset B never requests app-specific technical attributes like size (e.g., 'size of Browser 4G') that Dataset A includes"
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries do not reference sentiment-related metrics (e.g., sentiment polarity, subjectivity, or user attitudes like 'positive'/'neutral') present in all Dataset A samples.",
        "Dataset B lacks references to translated reviews or multilingual content analysis present in Dataset A queries.",
        "Dataset B excludes time-based filters (e.g., 'not updated since 2015') found in Dataset A questions.",
        "Dataset B never combines rating analysis with user sentiment percentages (e.g., 'percentage of positive sentiments') like Dataset A consistently does.",
        "Dataset B queries focus exclusively on Play Store apps ('Play Store'/'playstore' mentioned in all samples), while Dataset A questions lack platform specificity.",
        "Dataset B thresholds exclusively use install counts (e.g., '>500 installs') rather than Dataset A's sentiment scores (e.g., 'sentiment > 0.5') or review counts.",
        "Dataset B queries never request technical app attributes (e.g., app size, update status) that appear in Dataset A questions.",
        "Dataset B lacks compound percentage calculations (e.g., 'percentage having more negative sentiment') present in Dataset A queries.",
        "Dataset B questions never combine rating analysis with review text analysis (e.g., 'reviews with comments') featured in Dataset A.",
        "Dataset B uses standardized category naming conventions (e.g., 'AUTO_AND_VEHICLES') unlike Dataset A's mixed formats (e.g., 'arcade genre')"
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries explicitly reference the 'PlayStore' or 'Play Store' platform, while Dataset A does not mention a specific platform.",
        "Dataset A includes questions about sentiment polarity scores, sentiment subjectivity scores, and user attitudes (positive/neutral/negative), which are absent in Dataset B.",
        "Dataset A requests translated reviews or comments (e.g., 'List all free sports Apps and their translated review'), whereas Dataset B does not involve review text analysis.",
        "Dataset A contains queries about app update timelines (e.g., 'apps not updated since 2015'), while Dataset B lacks temporal filters related to app maintenance.",
        "Dataset A explicitly references granular age-based content ratings (e.g., 'Everyone 10+', 'Mature 17+'), while Dataset B uses broader demographic filters like 'suitable for teenagers'.",
        "Dataset A combines multiple metrics in single queries (e.g., average rating + user sentiment count), whereas Dataset B questions focus on isolated metrics like average rating or install counts.",
        "Dataset A includes app size as a query parameter (e.g., 'size of Browser 4G'), which is absent in Dataset B.",
        "Dataset A calculates percentages of sentiment distribution (e.g., 'percentage of positive sentiments'), while Dataset B focuses solely on numerical thresholds without proportional sentiment breakdowns.",
        "Dataset A asks for sentiment subjectivity scores (e.g., 'total Sentiment subjectivity score'), a metric not referenced in Dataset B.",
        "Dataset A explicitly requests raw review content (e.g., 'List all the comments'), whereas Dataset B does not involve direct review text retrieval."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B queries do not mention translated reviews, whereas Dataset A frequently asks for translated reviews (e.g., 'translated review', 'state the translated review').",
        "Dataset B does not reference app size (e.g., 'size of Browser 4G') or price (e.g., 'average price'), unlike Dataset A.",
        "Dataset B does not include sentiment polarity scores (e.g., 'sentiment polarity score') or sentiment subjectivity scores, whereas Dataset A explicitly queries these metrics.",
        "Dataset B queries lack percentage-based sentiment analysis (e.g., 'percentage of positive sentiments'), which are common in Dataset A.",
        "Dataset B does not combine sentiment analysis with install counts or metadata in single queries (e.g., 'installs amount of these apps and percentage of positive sentiments'), unlike Dataset A.",
        "Dataset B does not filter by specific age groups (e.g., 'age group targeted') or maturity levels beyond general content ratings, unlike Dataset A.",
        "Dataset B queries are simpler and repetitive (e.g., multiple variations of 'average rating of Games category'), while Dataset A queries are more complex with layered conditions.",
        "Dataset B does not reference app update dates (e.g., 'apps not updated since 2015') in conjunction with sentiment analysis, unlike Dataset A.",
        "Dataset B does not ask for 'worst rating' or 'lowest rated' apps combined with sentiment counts, unlike Dataset A.",
        "Dataset B does not include queries about app genres (e.g., 'arcade genre'), focusing only on categories, unlike Dataset A."
      ]
    }
  },
  "diffs_real_from_synth": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B explicitly requests listing both person IDs and categorical attributes (e.g., course levels, position statuses) together in output pairs, whereas A typically requests these separately.",
        "Dataset B includes queries about combined course categories (e.g., 'basic **or** medium undergraduate courses'), while A focuses on single explicit course levels like 'Level_500'.",
        "Dataset B contains queries about faculty membership status ('affiliated professors in faculty') as a **filter condition** for counting/listing, while A uses faculty affiliation primarily as a grouping attribute.",
        "Dataset B specifically references numerical ranges for IDs (e.g., 'person IDs from 40 to 50'), a pattern not seen in A's samples.",
        "Dataset B asks for identification of professors teaching 'high-level or harder undergraduate courses' \u2013 a compound difficulty/level filter not present in A's simpler level-based queries.",
        "Dataset B includes explicit requests for **minimum** aggregates ('professor taught the least amount of courses'), whereas A focuses on maximums/totals.",
        "Dataset B combines student years in program with advisor relationships (e.g., 'students with eighth year of program and position status'), while A links years to phases/statuses without advisor context.",
        "Dataset B uses 'person IDs' as a universal identifier across entities, while A maintains separate 'professor ID' and 'student ID' distinctions.",
        "Dataset B explicitly asks for employment verification patterns ('Is the teacher... a faculty member?'), whereas A focuses on employment status as a counting filter.",
        "Dataset B includes structured output formatting requirements (e.g., 'List... and state...') with multiple concurrent data points, while A requests singular outputs more frequently."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B queries often request composite attribute listings (e.g., 'person IDs and course levels') while A typically requests single attributes",
        "B includes explicit positional status filters (e.g., 'position status in faculty') not seen in A's simpler faculty membership checks",
        "B contains multi-clause numerical constraints in aggregations (e.g., 'no more than two high-level courses') while A uses simple counts",
        "B uses dual-entity conditional relationships (e.g., 'students with eighth year of program AND professor position status') where A focuses on single relationships",
        "B employs varied terminology for educators ('teacher' vs 'professor') while A consistently uses 'professor'",
        "B specifies program year granularity in student filters (e.g., '5th year', '8th year') absent in A's queries",
        "B requires combined extremal-aggregate outputs (e.g., 'most courses AND general course level') where A focuses purely on extremal values",
        "B includes hierarchical qualification chains (e.g., 'professors teaching X courses who advise Y students') unlike A's direct relationships",
        "B explicitly queries course type categories (e.g., 'professional/master') while A uses basic level descriptors",
        "B employs imperative instruction verbs ('Provide', 'Describe', 'Mention') requiring structured outputs where A uses simple interrogatives"
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B uses 'teacher' and 'member of the faculty' interchangeably with 'professor,' while A consistently uses 'professor.'",
        "B includes yes/no existence checks (e.g., 'Is the teacher...'), whereas A focuses on counts, lists, or comparisons.",
        "B explicitly references specific course numbers (e.g., 'course no.9,' 'course 165'), while A uses numerical ranges or inequalities for IDs.",
        "B combines course level filters with logical operators (e.g., 'basic or medium,' 'high-level or harder'), whereas A typically applies single-level filters.",
        "B requests combined attribute pairs (e.g., 'position status and IDs,' 'person IDs and course levels') in results, while A often lists attributes separately.",
        "B introduces 'professional courses' as a distinct category, absent in A, which focuses on undergraduate/master/graduate levels.",
        "B specifies student academic progress via exact year numbers (e.g., '5th year,' 'eighth year'), while A uses phases (e.g., 'Phase 1,' 'master/graduate phase').",
        "B includes requests to 'describe' or 'mention' attributes alongside listing them, a phrasing not found in A.",
        "B uses the term 'employees' (e.g., 'faculty employees') to describe faculty roles, whereas A uses 'faculty affiliated position' or 'position of Faculty.'",
        "B combines total counts with explicit ID listings in the same query (e.g., 'total of professional courses... list course id'), while A separates these into distinct queries."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B queries frequently combine multiple attributes in the output (e.g., 'person IDs and course levels', 'position status and IDs') while A focuses on single attributes like counts or course IDs alone.",
        "Dataset B includes numerical constraints in course/professor filters (e.g., 'no more than two high-level courses') whereas A uses categorical or threshold-based filters (e.g., '500 or higher').",
        "Dataset B explicitly references positional faculty status (e.g., 'position status in faculty', 'member of faculty') in filtering or outputs, while A primarily checks general faculty membership without positional details.",
        "Dataset B asks for comparative aggregations (e.g., 'professor taught the least amount of courses', 'teaching the most courses') whereas A focuses on absolute counts without ranking extremes.",
        "Dataset B includes explicit yes/no verification queries (e.g., 'Is the teacher... a faculty member?') which are absent in A.",
        "Dataset B uses ID ranges in filters (e.g., 'person IDs from 40 to 50') while A queries single IDs (e.g., 'professor with ID 100').",
        "Dataset B ties advising relationships to faculty status or positional details (e.g., 'advisor IDs... and position status in faculty') whereas A treats advising as a standalone relationship.",
        "Dataset B explicitly links course difficulty tiers (e.g., 'high-level or harder undergraduate courses') to counts or filters, while A uses standalone level categories (e.g., 'graduate level').",
        "Dataset B includes multi-condition student filters (e.g., 'students with eighth year of program and position status in faculty') whereas A filters students by phase alone (e.g., 'phase 2').",
        "Dataset B requests combined descriptive outputs (e.g., 'course level and list of person IDs') in single queries, while A separates these into distinct queries (e.g., course level vs. course IDs)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries frequently require returning composite results (e.g., multiple columns like person IDs + course levels) rather than single values",
        "Dataset B contains explicit range queries (e.g., 'person IDs from 40 to 50') while A uses only specific identifier lookups",
        "Dataset B includes aggregation with compound conditions (e.g., 'no more than two high-level... courses') in count operations",
        "Dataset B explicitly asks for boolean verification queries (e.g., 'Is the teacher... a faculty member?') using yes/no patterns",
        "Dataset B references academic levels (undergraduate/graduate) in conjunction with course levels, which A never combines",
        "Dataset B contains extremum queries (e.g., 'teaching the most courses', 'least amount of courses') using superlatives",
        "Dataset B requires combined faculty status + teaching output in results (e.g., 'position status AND course levels')",
        "Dataset B uses explicit categorical grouping in counts (e.g., 'basic OR medium', 'professional OR master/graduate')",
        "Dataset B queries frequently combine student year filters with advisor relationships in multi-condition joins",
        "Dataset B contains explicit requests for faculty membership statistics (e.g., 'how many teachers are faculty employees') as standalone metrics"
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B questions frequently require multi-field outputs (e.g., combining IDs, levels, or statuses), while A typically requests single attributes.",
        "Dataset B explicitly categorizes courses by program type (e.g., 'undergraduate,' 'professional,' 'master/graduate'), whereas A focuses solely on difficulty levels (basic/medium/high).",
        "Dataset B includes compound conditional aggregations (e.g., 'no more than two high-level courses'), while A uses simpler aggregations like 'how many' or 'most.'",
        "Dataset B contains yes/no questions (e.g., 'Is the teacher a faculty member?'), which are absent in A.",
        "Dataset B uses numerical ID ranges (e.g., 'person IDs from 40 to 50') in filters, whereas A relies on exact singular values.",
        "Dataset B integrates student years in programs with faculty status checks (e.g., 'students in 8th year with faculty-position professors'), while A focuses on standalone duration filters.",
        "Dataset B explicitly references granular faculty position statuses (e.g., 'position status in faculty'), whereas A only mentions general faculty membership.",
        "Dataset B requires ranking operations (e.g., 'professor who taught the least amount of courses'), while A\u2019s aggregations are limited to counts or extremes like 'highest level.'",
        "Dataset B combines course categories with faculty membership constraints (e.g., 'courses taught by faculty members'), adding layered filters absent in A.",
        "Dataset B distinguishes between course types like 'professional' and 'master/graduate,' while A uses only difficulty-based classifications."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries explicitly request listing multiple attributes simultaneously (e.g., 'person IDs and course levels', 'course IDs and levels', 'position status and IDs') while A typically focuses on single attributes per query.",
        "Dataset B includes direct requests for counts of course types (e.g., 'How many basic or medium undergraduate courses?') as primary objectives, whereas A uses counts primarily for filtering professors (e.g., '\u22652 courses').",
        "Dataset B references specific student details (e.g., 'student ID 80', 'student ID \"303\"') and ties them to advisor roles, while A focuses on professors' attributes without explicit student ID linkages.",
        "Dataset B uses positional or employment status terms like 'position status', 'affiliated professors in faculty', and 'faculty employees', which are absent in A\u2019s samples.",
        "Dataset B includes explicit range filters for IDs (e.g., 'person IDs from 40 to 50'), whereas A uses single ID/value filters (e.g., 'professor ID 297').",
        "Dataset B contains compound course-level categories (e.g., 'basic or medium undergraduate', 'professional or master/graduate'), while A typically references singular levels (e.g., 'masters', 'advanced').",
        "Dataset B asks for confirmation or existence checks (e.g., 'Is the teacher... a faculty member?'), a pattern not observed in A.",
        "Dataset B explicitly queries about students\u2019 years in programs in relation to their advisors (e.g., 'students with eighth year of program'), whereas A ties years in program directly to professors\u2019 teaching eligibility.",
        "Dataset B uses the term 'teacher' interchangeably with 'professor', while A exclusively uses 'professor'.",
        "Dataset B includes student-focused aggregation (e.g., 'students in the 3rd year... advised by how many professors?'), whereas A\u2019s aggregations focus solely on professors\u2019 teaching metrics."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B includes queries combining course level with academic degree (e.g., 'undergraduate courses', 'professional or master/graduate') as compound filters",
        "Dataset B explicitly requires faculty affiliation checks ('affiliated professors in faculty') as part of entity qualification criteria",
        "Dataset B contains queries requesting paired outputs of identifiers with categorical attributes (e.g., 'person IDs and course levels', 'IDs and years in program')",
        "Dataset B includes position status verification alongside faculty membership in result outputs (e.g., 'position status in faculty')",
        "Dataset B features numeric range filters for IDs (e.g., 'person IDs from 40 to 50') not present in A",
        "Dataset B requires comparisons of teaching quantities using superlatives ('most courses', 'least amount of courses') with result linkage to course levels",
        "Dataset B specifies 8th year program status for students, extending beyond the 5th year maximum shown in A",
        "Dataset B includes explicit course difficulty qualifiers ('high-level or harder') combined with degree levels",
        "Dataset B contains queries that require counting professors rather than courses as primary focus (e.g., 'how many professors teaches...')",
        "Dataset B asks for verification of faculty employment status as standalone Boolean outputs (e.g., 'Is the teacher... a faculty member?')"
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries requesting URLs (e.g., rating URLs, movie URLs, director page URLs), while A does not reference URLs.",
        "Dataset B explicitly asks for metadata like cover images (e.g., 'cover image of the user') and list descriptions, whereas A focuses purely on numerical/textual attributes.",
        "Dataset B contains questions about critic interactions (e.g., 'ratings with a critic'), which are absent in A.",
        "Dataset B includes time-bound eligibility checks (e.g., 'when he or she was a paying subscriber' during list creation), while A only checks static subscription status.",
        "Dataset B explicitly requests timestamps for first/last occurrences (e.g., 'first movie released', 'updated most recently'), whereas A uses timestamps only for filtering thresholds.",
        "Dataset B requires handling of social engagement metrics like 'comments' and 'likes' (e.g., 'received 20 likes'), which are absent in A.",
        "Dataset B includes multi-part questions combining counts with resource links (e.g., 'how many users... and what is the image URL'), while A focuses on singular metric extraction.",
        "Dataset B queries explicit list ownership comparisons (e.g., 'how many more movie lists were created by the user'), whereas A focuses on follower counts or movie counts within lists.",
        "Dataset B includes direct yes/no verification questions (e.g., 'Was user X a trialist when...'), while A uses trialist status only for aggregations/percentages.",
        "Dataset B references list update timestamps for prioritization (e.g., 'updated most recently'), while A only uses creation timestamps for filtering."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries about user subscription status (e.g., 'paying subscriber', 'trialist') in filtering criteria, which are absent in Dataset A.",
        "Dataset B requests URLs specific to user-generated rating pages (e.g., individual user ratings) and critic-related links, whereas Dataset A focuses on general movie/list URLs.",
        "Dataset B involves percentage calculations (e.g., 'percentage of rated movies'), while Dataset A only uses absolute counts.",
        "Dataset B explicitly references critic interactions (e.g., 'critic comments', 'links to ratings with a critic'), which are not present in Dataset A.",
        "Dataset B combines multiple temporal and conditional filters (e.g., 'users who were eligible for trial when they rated [movie]') in a single query, whereas Dataset A uses simpler filters.",
        "Dataset B asks for descriptive metadata (e.g., 'description of user\u2019s movie list', 'cover image'), while Dataset A focuses on structural data like titles and timestamps.",
        "Dataset B includes comparative queries (e.g., 'how many more', 'most recently updated') that require relative analysis, unlike Dataset A\u2019s absolute counts.",
        "Dataset B references user eligibility states (e.g., 'eligible for trial') during actions like rating or list creation, which Dataset A omits.",
        "Dataset B filters by specific rating scores (e.g., 'rated \"1\"', 'rating of 4') in user subgroups, whereas Dataset A only asks for highest/overall scores.",
        "Dataset B requires identifying whether a user\u2019s action (e.g., creating a list) occurred during a subscription period, a detail absent in Dataset A."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B includes requests for specific temporal data such as exact rating dates (e.g., \"rating date\"), while A focuses on broader time periods (e.g., release year).",
        "Dataset B explicitly asks for URLs tied to individual user actions (e.g., a user's rating URL), whereas A focuses on general movie/list/director page URLs.",
        "Dataset B references list descriptions (e.g., \"description of user's movie list\"), which are absent in A's queries.",
        "Dataset B includes user profile or cover image URLs (e.g., \"cover image of the user\"), a feature not present in A.",
        "Dataset B incorporates critic-related interactions (e.g., \"critic comments\"), while A focuses solely on user ratings and metrics.",
        "Dataset B involves social engagement metrics like \"likes\" on user ratings, absent in A.",
        "Dataset B contains yes/no questions about user status during actions (e.g., \"Was user... a trialist?\"), whereas A focuses on counts/aggregates.",
        "Dataset B uses precise follower count ranges (e.g., \"followers between 1-2\"), while A uses thresholds (e.g., \"more than 100 followers\").",
        "Dataset B requests image URLs (e.g., \"image URL to the movie\"), which are not mentioned in A.",
        "Dataset B requires tie-breakers and multi-metric resolutions for director queries (e.g., listing all tied directors and their highest ratings), whereas A asks for simpler rankings (e.g., \"directors with the most movies\")."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B includes queries requesting specific URLs or links (e.g., movie pages, rating URLs, list pages), while Dataset A does not reference URLs.",
        "Queries in Dataset B explicitly reference user subscription statuses (e.g., 'paying subscriber,' 'trialist') as filtering criteria, which are absent in Dataset A.",
        "Dataset B contains questions about the temporal status of user actions (e.g., 'when he/she was a paying subscriber'), tying user metadata to specific time-bound events, unlike Dataset A.",
        "Dataset B includes requests for descriptive text fields (e.g., list descriptions, movie cover images), whereas Dataset A focuses solely on numerical or categorical metrics.",
        "Queries in Dataset B frequently combine multiple conditional layers (e.g., user status + rating score + temporal range), creating compound logical filters not seen in Dataset A.",
        "Dataset B explicitly asks for percentages (e.g., 'percentage of rated movies') as outputs, whereas Dataset A uses only absolute counts or averages.",
        "Dataset B references platform-specific entities like 'Mubi' and 'critic comments' directly in queries, while Dataset A uses generic terms like 'the database' or 'critic likes'.",
        "Queries in Dataset B request binary yes/no outcomes (e.g., 'Was user X a trialist?'), which are absent in Dataset A's purely quantitative questions.",
        "Dataset B includes queries about user-generated content metadata (e.g., 'list page URLs,' 'cover images'), while Dataset A focuses on movie/list attributes without platform-specific metadata.",
        "Dataset B requires tracking user activity across hybrid criteria (e.g., 'users eligible for trial when they rated'), integrating temporal and status-based conditions not present in Dataset A."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes queries requesting URLs (e.g., movie pages, rating URLs, list URLs), while A does not reference URLs.",
        "Dataset B explicitly asks for user subscription status (e.g., trialists, paying subscribers) during actions like rating or list creation, whereas A lacks subscription context.",
        "Dataset B contains multi-part questions requiring compound outputs (e.g., 'list names and popularity'), while A focuses on single-value answers.",
        "Dataset B references movie directors and director-related metrics (e.g., director pages, director counts), which are absent in A.",
        "Dataset B includes temporal comparisons beyond simple date filters (e.g., 'most recently updated list in 2016'), while A uses basic date ranges.",
        "Dataset B requires percentage calculations (e.g., '% of rated movies released in 2021'), whereas A only uses absolute counts/averages.",
        "Dataset B queries metadata beyond titles/IDs (e.g., list descriptions, cover images, critic comments), while A focuses on core attributes.",
        "Dataset B includes platform-specific entities like Mubi user eligibility and critic interactions, which are absent in A's generic user/movie focus.",
        "Dataset B uses complex conditional logic (e.g., 'if multiple directors tie, list all...'), while A uses straightforward ranking/aggregation.",
        "Dataset B explicitly references external content links (e.g., movie pages on Mubi), whereas A remains platform-agnostic."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Queries in B explicitly request multiple output fields (e.g., title + rating date + score), while A focuses on single-value responses",
        "B includes percentage-based calculations (e.g., 'percentage of rated movies'), which never appear in A",
        "B contains questions requiring comparisons between quantities (e.g., 'how many more...'), unlike A",
        "B specifically requests critic-related data (e.g., ratings with critics) absent in A's samples",
        "B asks for cover images/profile images (e.g., user cover image), while A only requests URL links to core entities",
        "B includes temporal range comparisons (e.g., 'between 1/1/2017 to 12/31/2017') rather than A's relative time frames (e.g., 'last year')",
        "B requires boolean state confirmation responses (e.g., 'Was user...') that don't appear in A",
        "B explicitly demands dual conditional outputs (e.g., count + subscriber status) in single questions, unlike A's singular outputs",
        "B contains composite identifier requests (e.g., combining director metrics with user comment metrics) not seen in A",
        "B asks for list/movie descriptions in addition to titles, which A never requires"
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries explicitly request URLs, links, or image references (e.g., cover images, rating URLs, director pages).",
        "Dataset B includes questions about directors and critic-related interactions (e.g., critic comments, director pages).",
        "Dataset B contains queries requiring exact rating score counts (e.g., 'how many users rated 4') rather than just maximum/aggregate scores.",
        "Dataset B uses percentage-based aggregations (e.g., 'percentage of rated movies from 2021') not seen in A.",
        "Dataset B references user-generated content metadata like list descriptions and follower counts for specific lists.",
        "Dataset B includes questions about user engagement metrics beyond subscriptions (e.g., likes on ratings, comment counts).",
        "Dataset B explicitly requires output of temporal metadata like rating dates alongside results.",
        "Dataset B queries specify exact numerical user IDs (e.g., 39115684) rather than placeholder identifiers like 'user1'.",
        "Dataset B contains multi-part questions combining entity relationships (e.g., linking directors to movies to user ratings).",
        "Dataset B frequently asks about the trial/paying status context at the time of specific actions (e.g., 'when he created the list') rather than general subscription status."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B includes requests for specific URLs or links (e.g., movie URLs, list page URLs, director page URLs), while A does not.",
        "Dataset B explicitly asks for timestamps (e.g., rating dates, list update timestamps, creation dates) in responses, whereas A does not reference temporal metadata beyond release years.",
        "Dataset B requires percentage calculations (e.g., 'percentage of rated movies released in 2021'), while A focuses on absolute counts or averages.",
        "Dataset B queries often tie subscription status to specific actions (e.g., 'created when the user was a paying subscriber'), whereas A treats subscription status as a general attribute.",
        "Dataset B references external media (e.g., cover images, URLs, critic comments) directly, while A does not involve media assets.",
        "Dataset B includes questions with compound results (e.g., combining counts with subscriber status or URLs), whereas A typically requests singular metrics.",
        "Dataset B explicitly asks for user-generated list names (e.g., 'Georgia related films') as identifiers, while A refers to lists generically.",
        "Dataset B requires resolving ties in aggregated results (e.g., 'if multiple directors have the same count, list all'), whereas A assumes uniqueness in results like 'highest rating'.",
        "Dataset B includes comparative queries about list growth (e.g., 'how many more movie lists were created'), while A focuses on static counts.",
        "Dataset B explicitly checks temporal alignment of user status and actions (e.g., 'was the user a trialist when they rated'), whereas A does not link status to event timing."
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes sentiment analysis metrics (e.g., sentiment polarity, subjectivity) in queries, absent in A.",
        "Dataset B explicitly requests translated user reviews (e.g., 'translated review'), while A does not reference review text.",
        "Dataset B incorporates content rating specificity (e.g., 'Everyone 10+', 'Mature 17+') as standalone filters, whereas A uses broader content ratings like 'Everyone'.",
        "Dataset B requires temporal filters (e.g., 'not been updated since 2015'), absent in A's queries.",
        "Dataset B queries user sentiment categories (e.g., 'neutral attitude', 'positive favorability'), while A focuses purely on numerical sentiment thresholds (e.g., 'sentiment > 0.5').",
        "Dataset B combines sentiment scores with qualitative metadata (e.g., 'age group targeted'), whereas A combines numerical metrics only (e.g., rating + installs).",
        "Dataset B explicitly asks for review comment counts (e.g., 'how many of the reviews have a comment?'), a feature absent in A.",
        "Dataset B includes sentiment-derived percentages (e.g., 'percentage of positive sentiments'), while A calculates percentages based on installs/ratings only.",
        "Dataset B references app lifecycle attributes (e.g., update dates), whereas A focuses exclusively on static app attributes (e.g., size, price).",
        "Dataset B queries sentiment subjectivity scores (e.g., 'total Sentiment subjectivity score'), a metric not present in A."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes explicit references to user sentiment categories (positive/neutral/negative) rather than just numerical sentiment scores",
        "Dataset B queries frequently request translated user reviews or comments as part of results",
        "Dataset B contains questions about app pricing/commercial aspects (e.g., average price, free vs paid) not present in A",
        "Dataset B incorporates sentiment subjectivity scores alongside polarity scores in analysis",
        "Dataset B includes specific content rating categories (Everyone 10+, Mature 17+) as filters",
        "Dataset B queries combine sentiment analysis with installation numbers/commercial success metrics",
        "Dataset B explicitly requests percentage calculations of sentiment distributions (negative vs positive)",
        "Dataset B contains questions about app update timelines combined with sentiment trends",
        "Dataset B references specific user review text analysis (comments availability, review content)",
        "Dataset B includes demographic targeting information (age groups) in query criteria"
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B includes specific sentiment analysis components (e.g., neutral attitude counts, negative sentiment counts) not present in A, which focuses only on average polarity scores.",
        "Dataset B explicitly requests translated user reviews (e.g., 'state the translated review if available'), while A does not reference review language or translation.",
        "Dataset B incorporates questions about app pricing and economic metrics (e.g., 'average price of games'), whereas A focuses solely on free apps without monetary attributes.",
        "Dataset B requires exact counts of reviews with comments (e.g., 'how many of the reviews... have a comment?'), while A aggregates metrics like total installs or average ratings.",
        "Dataset B emphasizes identification of worst/lowest-rated apps (e.g., 'lowest rated puzzle games'), while A focuses exclusively on top-ranked/highest-performing apps.",
        "Dataset B includes metadata queries about app size (e.g., 'how much is the size of Browser 4G?'), which A never references.",
        "Dataset B quantifies user favorability through percentages (e.g., 'percentage of positive sentiments'), while A uses absolute thresholds like '0.5 or higher' for sentiment polarity.",
        "Dataset B explicitly requests verbatim review content listing (e.g., 'List all of its reviews'), whereas A only aggregates review-related metrics like counts or averages.",
        "Dataset B combines sentiment percentages with other filters (e.g., 'percentage of negative sentiment' + update date), while A combines numerical thresholds without sentiment ratios.",
        "Dataset B includes queries about apps with no negative sentiment (e.g., 'app that does not have negative sentiment'), a distinction absent in A's purely threshold-based sentiment filters."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B includes queries about sentiment polarity scores and sentiment subjectivity scores, which are not present in Dataset A.",
        "Dataset B contains questions about user attitudes (e.g., 'neutral attitude', 'positive attitude') and sentiment classifications not found in Dataset A.",
        "Dataset B references specific app names in queries (e.g., '10 Best Foods for You', 'Basketball Stars') while Dataset A only uses categories/genres.",
        "Dataset B includes temporal filters related to app updates (e.g., 'not been updated since 2015') that don't appear in Dataset A.",
        "Dataset B asks for translated reviews and multilingual content analysis, which is absent in Dataset A's questions.",
        "Dataset B incorporates pricing metrics (e.g., 'average price of games') that aren't referenced in Dataset A's numerical thresholds.",
        "Dataset B requires percentage calculations (e.g., 'percentage of positive sentiments') while Dataset A focuses on absolute counts and averages.",
        "Dataset B queries include content rating filters (e.g., 'Everyone 10+', 'Mature 17+') as additional parameters beyond Dataset A's category/genre filters.",
        "Dataset B contains questions about app-specific metadata like 'translated reviews' and 'comment existence' that aren't present in Dataset A.",
        "Dataset B explicitly requests combined metrics (e.g., 'total installs AND translated reviews') in single queries, creating multi-output requirements not seen in Dataset A."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Queries in dataset B frequently request multiple distinct data points in a single question (e.g., both metrics and categorical attributes), while A focuses on single metrics per query.",
        "Dataset B explicitly references sentiment analysis metrics (e.g., 'sentiment polarity', 'sentiment subjectivity') not present in A's simpler 'positive/neutral' categorizations.",
        "B includes requests for translated user reviews ('state the translated review'), while A only deals with review counts without linguistic aspects.",
        "Time-based filters (e.g., 'not been updated since 2015') appear exclusively in B's queries.",
        "Percentage calculations ('percentage of negative sentiment') are unique to B's analytical requirements.",
        "B specifically filters by content rating tiers ('Everyone 10+', 'Mature 17+') beyond basic categories used in A.",
        "Pricing analysis combined with other attributes (e.g., 'average price of games... with content rating') occurs only in B.",
        "B explicitly requests raw user-generated text data ('list all of its reviews', 'comments') rather than just aggregated counts like A.",
        "Numerical ranking positions ('top 3', 'top 5', 'top 10') are explicitly quantified in B's queries, while A uses general superlatives ('highest').",
        "Demographic targeting parameters ('age group targeted') appear exclusively in B's questions."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes questions about user sentiment analysis (e.g., sentiment polarity, positive/negative sentiment percentages), while Dataset A does not mention sentiment-related metrics.",
        "Dataset B explicitly requests translated user reviews or comments (e.g., 'translated review if available'), whereas Dataset A focuses solely on app metadata without user-generated text analysis.",
        "Dataset B contains temporal filters related to app updates (e.g., 'not been updated since 2015'), while Dataset A lacks time-based criteria.",
        "Dataset B queries technical app attributes like app size (e.g., 'size of Browser 4G'), whereas Dataset A focuses on ratings, installs, and pricing.",
        "Dataset B includes percentage calculations for sentiment distributions (e.g., 'percentage of negative sentiment'), while Dataset A uses simple numerical thresholds without proportional analysis.",
        "Dataset B combines sentiment metrics with other attributes in single queries (e.g., 'average sentiment polarity score + age group'), whereas Dataset A queries typically focus on one primary metric per question.",
        "Dataset B explicitly requests review counts and comment existence (e.g., 'how many of the reviews have a comment'), while Dataset A does not analyze review content granularity.",
        "Dataset B includes boolean sentiment conditions (e.g., 'apps that do not have negative sentiment'), whereas Dataset A uses only numerical comparisons for filtering.",
        "Dataset B asks for worst-case scenarios (e.g., 'worst rating', 'lowest rated apps'), while Dataset A focuses on averages and top performers without negative extremes.",
        "Dataset B references sentiment subjectivity scores (e.g., 'total Sentiment subjectivity score'), a metric absent in Dataset A's purely objective numerical ratings."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B includes queries about sentiment analysis metrics (e.g., sentiment polarity, subjectivity, or positive/negative sentiment percentages), while A focuses only on numerical ratings.",
        "Dataset B explicitly requests translated user reviews (e.g., 'translated review if available'), whereas A does not reference multilingual data.",
        "Dataset B incorporates temporal filters (e.g., 'not been updated since 2015'), while A lacks queries related to app update timelines.",
        "Dataset B asks for percentage-based metrics (e.g., 'percentage of negative sentiment'), whereas A uses absolute counts or averages without percentages.",
        "Dataset B combines multiple distinct metrics in single queries (e.g., 'average rating and user sentiment'), while A typically isolates one metric per query.",
        "Dataset B references app technical metadata (e.g., app size, update status), whereas A focuses on categories, ratings, and installs without technical details.",
        "Dataset B explicitly asks about comment presence (e.g., 'how many reviews have a comment'), while A focuses on review volume rather than comment existence.",
        "Dataset B includes queries for worst-performing apps (e.g., 'App has the worst rating'), whereas A emphasizes top performers or averages.",
        "Dataset B uses granular sentiment qualifiers (e.g., 'neutral attitude,' 'pretty positive favorability'), while A lacks such nuanced sentiment distinctions.",
        "Dataset B queries app-specific scores like 'total Sentiment subjectivity score,' a metric absent in A's focus on ratings and installs."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B includes queries about app pricing details (e.g., average price, free vs paid), while A does not reference price.",
        "Queries in B explicitly request translated reviews or comments (e.g., 'translated review if available'), absent in A.",
        "B combines multiple distinct data points (e.g., install count + sentiment percentage) in single queries, unlike A's singular focus.",
        "B uses granular sentiment metrics (e.g., polarity scores like >0.5), whereas A uses general labels (e.g., 'positive sentiment').",
        "B includes percentage-based calculations (e.g., 'percentage of negative sentiment'), not seen in A.",
        "B references app size (e.g., 'size of Browser 4G'), a metadata attribute unmentioned in A.",
        "B explicitly asks for full review/comment lists (e.g., 'list all reviews'), while A only queries aggregated counts or sentiments.",
        "B queries the presence of comments in reviews (e.g., 'how many reviews have a comment'), a feature absent in A.",
        "B specifies sub-genres (e.g., 'arcade genre') within broader categories, whereas A uses only high-level categories like 'Games'.",
        "B includes install ranges (e.g., 'free apps') and pricing models, while A focuses solely on install counts without pricing context."
      ]
    }
  }
}