{
  "sims": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets query relationships between professors, students, courses, and advisors using specific IDs.",
        "Questions in both involve filtering by course levels (e.g., basic/medium/high-level in A, Level_200/Level_500 in B).",
        "Both require counting entities (e.g., professors, courses, students) under specific constraints.",
        "Queries in both datasets use faculty employment status (e.g., faculty vs. non-faculty professors) as a filter.",
        "Both include questions about aggregations (e.g., totals, percentages, min/max values, top N rankings).",
        "Questions in both involve cross-referencing student years in the program with advisor/professor assignments.",
        "Both datasets ask for lists of IDs (course, professor, student) tied to specific conditions like teaching loads or advising roles.",
        "Queries in both require joining data across multiple entities (e.g., students advised by professors who teach specific courses).",
        "Both include comparative questions (e.g., 'which course has more teachers' in A, 'highest course level' in B).",
        "Questions in both use thresholds (e.g., 'more than 4 students', 'more than 2 years in the program') to filter results."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets focus on querying relationships between professors/advisors and the courses they teach.",
        "Questions in both datasets frequently use specific numerical IDs to identify professors, students, advisors, and courses.",
        "Both include requests to count entities (e.g., courses taught, students advised, professors in a category).",
        "Course levels (e.g., basic, medium, high-level, Level_500) are explicitly queried in both datasets.",
        "Questions in both datasets ask for lists of IDs associated with professors, courses, or students meeting specific criteria.",
        "Both datasets involve aggregations (e.g., totals, percentages, counts) related to academic roles and course attributes.",
        "Queries about advising relationships (e.g., students advised by a specific professor/advisor) appear in both datasets.",
        "Both include comparisons between courses (e.g., which course has more teachers, course levels).",
        "Questions in both datasets require filtering results based on categorical criteria (e.g., faculty status, course difficulty).",
        "Both datasets structure queries around hierarchical academic roles (e.g., professors, faculty employees, advisors, students)."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Queries focus on aggregating counts (e.g., 'how many professors', 'total number of students').",
        "Questions reference specific numerical identifiers (e.g., course IDs, professor IDs, student IDs).",
        "Both datasets involve filtering by course level (e.g., 'basic', 'medium', 'high-level', 'Level 300').",
        "Queries frequently link advisors/professors to their students or courses taught.",
        "Questions require joins between entities (e.g., courses to professors, advisors to students).",
        "Both include requests to list IDs or specific attributes (e.g., 'list course IDs', 'state the level').",
        "Queries use conditional logic (e.g., 'taught by more than 4 people', 'not a faculty member').",
        "Questions target faculty affiliation or employment status (e.g., 'faculty member', 'professors in Faculty').",
        "Both datasets ask about comparative quantities (e.g., 'which course has more teachers', 'taught by at least two').",
        "Queries involve temporal or program-phase criteria (e.g., '5th year', 'master/graduate phase')."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets include questions about course levels (e.g., 'basic or medium undergraduate' in A and 'graduate level' in B).",
        "Both focus on quantifying professors/teachers and their teaching responsibilities (e.g., 'How many professors teaches...' in A and 'How many courses are taught by professors?' in B).",
        "Both require filtering by specific roles (e.g., 'faculty employee professors' in A and 'professors' in B).",
        "Both involve counting students in specific program phases (e.g., 'students in the 5th year' in A and 'students in phase 2' in B).",
        "Both include queries about advisor-student relationships (e.g., 'advisors of students' in A and 'advised by a professor' in B).",
        "Both datasets use numeric IDs to identify entities (e.g., 'student ID 80' in A and 'professor with ID 123' in B).",
        "Both ask for lists of course IDs tied to specific criteria (e.g., 'professional courses' in A and 'graduate courses' in B).",
        "Both involve aggregations like totals, percentages, and counts (e.g., 'Calculate the percentage...' in A and 'total number of students...' in B).",
        "Both include comparisons of course popularity (e.g., 'Which course has more teachers...' in A and 'Which courses have more than one professor...' in B).",
        "Both reference academic program structures (e.g., 'years in the program' in A and 'students enrolled in the program' in B)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query course levels using specific course IDs or person IDs.",
        "Both involve retrieving courses taught by specific professors identified by their IDs.",
        "Both datasets ask about the number of courses taught by individual professors.",
        "Both include questions about the advisor-student relationship, referencing advisor or professor IDs.",
        "Queries in both use numerical identifiers (IDs) for courses, professors, and students.",
        "Both datasets filter results based on academic roles (e.g., professor, student, faculty).",
        "Both include requests to list IDs (course, person, advisor) based on specific criteria.",
        "Queries in both count entities (courses, students, professors) meeting certain conditions.",
        "Both focus on associations between academic personnel (professors/students) and courses they teach/advise.",
        "Questions in both require database joins (e.g., linking courses to professors or advisors to students)."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query specific course levels (e.g., basic/medium/high-level in A, Level_400/Level_500 in B).",
        "Both require linking professors/teachers to the courses they teach using identifiers like IDs.",
        "Both include questions about courses taught by professors with specific numerical IDs (e.g., 'teacher no.79' in A, 'professor 150' in B).",
        "Both reference explicit course IDs (e.g., 'course no.11' in A, 'course id 120' in B).",
        "Both involve filtering or listing entities based on categorical course levels (e.g., 'high-level' or 'Level_500').",
        "Both ask for professors/teachers associated with a specific course or course level.",
        "Both use numerical identifiers for professors, students, or advisors (e.g., 'ID 415' in A, 'p_id 201' in B).",
        "Both include questions about the relationship between professors and their taught courses (e.g., 'courses taught by Professor X').",
        "Both require direct retrieval of attributes like course level or instructor ID from structured data.",
        "Both focus on granular entity relationships (e.g., professor-to-course, advisor-to-student, or course-to-level)."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Both datasets focus on querying relationships between professors/advisors and courses/students using specific IDs or attributes.",
        "Questions in both datasets frequently involve filtering by course level (e.g., 'high-level', 'medium undergraduate', 'masters').",
        "Both require counting/statistical operations (e.g., 'how many', 'percentage', 'most courses taught').",
        "Queries in both sets use explicit numerical thresholds (e.g., 'more than 4', 'at least 5 years', 'no more than two').",
        "Both datasets ask for listings of IDs associated with specific entities (courses, professors, students, advisors).",
        "Questions in both involve conditional logic with AND/OR operators (e.g., 'basic or medium', 'high-level or harder').",
        "Both datasets include queries about teaching/advising relationships (professor\u2192courses and advisor\u2192students).",
        "Queries in both require joining entity attributes (e.g., professor's yearsInProgram with course levels taught).",
        "Both datasets ask for identification of top performers (e.g., 'top 3 professors', 'professor with most courses').",
        "Questions in both involve verifying faculty/employment status of professors (Dataset A explicitly, Dataset B implicitly through program duration)"
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Both datasets focus on querying course levels (e.g., 'basic', 'undergraduate', 'graduate', 'high-level').",
        "Both involve identifying professors/teachers and their taught courses.",
        "Both require counting entities (e.g., courses, professors, students, advisors).",
        "Both include queries about specific IDs (e.g., course IDs, professor IDs, student IDs).",
        "Both address advisor-student relationships and academic roles (e.g., 'advised by', 'faculty member').",
        "Both involve filtering results by academic status (e.g., '5th year students', 'Phase 3 students').",
        "Both use numerical thresholds in queries (e.g., 'more than 4', 'no more than two').",
        "Both require aggregations like percentages and totals for course statistics.",
        "Both explicitly reference academic roles (e.g., 'professor', 'faculty employee', 'advisor').",
        "Both emphasize entity relationships (e.g., professors teaching courses, advisors mentoring students)."
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets include queries about average rating scores for movies directed by specific directors.",
        "Both datasets require filtering users based on subscription status (paying subscribers/trialists).",
        "Both datasets ask for counts of movies in user-created lists with specific follower thresholds.",
        "Both datasets involve time-based constraints (e.g., creation/update timestamps, rating dates).",
        "Both datasets query user-generated movie lists by title, follower count, or creation/update dates.",
        "Both datasets include questions about movies with specific rating thresholds (e.g., \u22654 stars).",
        "Both datasets require aggregations (e.g., averages, percentages, counts) across user ratings.",
        "Both datasets reference user IDs and movie IDs as primary identifiers for joins or filters.",
        "Both datasets ask for top-ranked items (e.g., most followers, highest-rated movies).",
        "Both datasets involve conditional logic (e.g., 'users who are both trialists and subscribers')."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets include queries about movie titles and attributes like release year, rating scores, and popularity.",
        "Questions in both datasets involve counting entities such as users, movies, lists, and followers.",
        "Both focus on user-generated movie lists, including details like titles, followers, and creation/update timestamps.",
        "Aggregation functions (e.g., average, highest, most) are frequently used in questions across both datasets.",
        "Temporal aspects (e.g., release years, list creation/update dates) are queried in both datasets.",
        "Specific identifiers (e.g., movie IDs, user IDs, list titles) are used to retrieve granular details in both datasets.",
        "Both datasets seek top-ranked or extreme values (e.g., highest-rated movies, most-followed lists).",
        "Conditional filtering (e.g., 'more than 200 followers,' 'released after 2000') is a common feature in queries.",
        "External resource references (e.g., URLs to profiles, ratings, or cover images) appear in both datasets.",
        "Both include questions about directors associated with movies (e.g., listing films by specific directors)."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Both datasets involve queries about user subscription status (e.g., paying subscribers, trial eligibility, payment methods).",
        "Both include questions requiring movie identification via explicit IDs (e.g., movie ID 1269 in A, movie ID 1000 in B).",
        "Both datasets ask for comparisons/aggregations of movie ratings (e.g., average scores, highest/lowest ratings).",
        "Both reference movie release years as a key filter (e.g., 'released on 1976' in A, 'movie in 2014' in B).",
        "Both involve queries about user-generated movie lists (e.g., list titles, followers, creation/update dates).",
        "Both require counting specific user/movie interactions (e.g., 'how many users rated' in A, 'number of ratings' in B).",
        "Both include questions about list popularity metrics (e.g., followers >200 in A, 'most followers' in B).",
        "Both use temporal filters for ratings/list updates (e.g., 'rated on 4/19/2020' in A, 'updated in the last month' in B).",
        "Both involve director-specific queries (e.g., 'directed by Felipe Cazals' in A, 'directed by James Cameron' in B).",
        "Both require multi-condition filtering (e.g., 'subscriber + trial eligible + rating \u22642' in A, 'payment method + top ratings' in B)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets include questions about calculating average rating scores for movies.",
        "Both datasets query specific movie attributes such as release year, director, and title.",
        "Both datasets involve filtering data based on user subscription status or eligibility (e.g., paying subscribers, trial users).",
        "Both datasets require aggregation operations (e.g., counts, averages) to summarize numerical metrics like ratings or user counts.",
        "Both datasets ask about popularity metrics, such as 'most popular movie' or lists with the most followers.",
        "Both datasets include questions that filter results by date ranges (e.g., ratings in April 2020, lists created after 2010).",
        "Both datasets reference user-generated content, such as movie lists, including their titles, update timestamps, and follower counts.",
        "Both datasets ask for comparisons or rankings (e.g., 'highest rating score,' 'longest period without updates').",
        "Both datasets require joining user data (e.g., profiles, subscriptions) with movie ratings or lists.",
        "Both datasets include questions about external URLs (e.g., user profile images, rating pages)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Queries focus on retrieving movie titles based on specific criteria (e.g., highest ratings, popularity, or IDs).",
        "Questions involve aggregating numerical data (e.g., average ratings, total followers, counts of ratings/users).",
        "Both datasets reference movie attributes like rating scores, popularity, release years, and titles.",
        "Questions target list properties such as follower counts, creation dates, and titles.",
        "Queries include filtering by unique identifiers (e.g., movie IDs, user IDs, list IDs).",
        "Both datasets prioritize comparisons or rankings (e.g., \"highest,\" \"most,\" \"lowest\").",
        "Questions frequently involve user-generated content (e.g., lists, ratings, followers).",
        "Queries require extracting metadata (e.g., movie titles, list titles, user IDs).",
        "Both datasets include time-based filters (e.g., creation dates, updates, release years).",
        "Questions often involve conditional logic (e.g., thresholds like \"greater than 3\" or \"over 200 followers\")."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets include questions about movie ratings, including specific rating scores (e.g., 4 or 5).",
        "Queries involve user-specific attributes such as subscription status (e.g., paying subscriber, trial eligibility) or payment methods.",
        "Both reference movie lists with properties like titles, IDs, follower counts, and creation/update timelines.",
        "Questions utilize unique identifiers (e.g., movie IDs, user IDs, list IDs) to target specific entities.",
        "Both datasets require aggregations like counts, averages, percentages, or rankings (e.g., 'most popular,' 'highest rating').",
        "Time-based constraints are present (e.g., ratings on specific dates, lists updated within a timeframe).",
        "Questions compare movies or lists using metrics like popularity, ratings, or follower counts.",
        "Both involve conditional logic combining user attributes (e.g., subscriber status) with actions (e.g., rating a movie).",
        "Queries reference movie metadata such as release years, genres, directors, and cover/URL assets.",
        "Both focus on list popularity metrics, including follower counts and update frequency."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Queries involve filtering movies based on user subscription status (e.g., paying, trial).",
        "Questions reference numerical thresholds (e.g., ratings \u22642, lists with >200 followers).",
        "Requests for aggregated metrics (e.g., averages, counts, top-N rankings) are frequent.",
        "Explicit use of movie/user IDs (e.g., 'movie id 1269', 'user ID 1') in criteria.",
        "Focus on time-based constraints (e.g., ratings on 4/19/2020, lists updated after 2021-01-01).",
        "Interest in popularity metrics (e.g., 'most popular movie', 'highest popularity').",
        "Queries target user-generated content (e.g., movie lists, followers, creator status).",
        "Requests for URLs or metadata (e.g., profile images, rating pages, cover images).",
        "Explicit joins between entities (e.g., movies rated by list creators, users with profile pictures).",
        "Frequent use of multi-condition filters (e.g., subscribers who rated \u22642, paid users with profile pics)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Both datasets query about movie rating scores (e.g., highest, average, or specific thresholds like '>8' or '<=2').",
        "Both include questions about user engagement metrics (e.g., number of users who rated a movie, paying subscribers, or trial eligibility).",
        "Both reference movie lists and their metadata (e.g., titles, followers, update dates, creators).",
        "Both ask for counts of entities like users, movies, ratings, or lists meeting specific criteria (e.g., 'how many movies were rated by users with a payment method').",
        "Both involve filtering by user status (e.g., paying subscribers, trial eligibility, or subscriber status during list creation).",
        "Both include queries about movies released in specific years (e.g., 'movies released in 2020' or 'movies directed by X in 1976').",
        "Both require aggregating numerical data (e.g., averages, percentages, totals).",
        "Both reference movie attributes like titles, directors, release years, and cover/profile image URLs.",
        "Both involve comparisons (e.g., 'how much higher is the average rating' or 'movies with the highest rating score').",
        "Both include time-based constraints (e.g., ratings on specific dates, lists updated after 2010, or activity within 10 years)."
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets query app ratings (e.g., specific values, averages, or thresholds like 'rating \u22654.0').",
        "Both filter results using installation counts (e.g., '10,000+ installs' or '100,000,000+ installs').",
        "Both categorize apps using attributes like genre (A) or category (B) (e.g., 'dating application' or 'FAMILY category').",
        "Both use aggregation functions (e.g., average, count, total) to derive insights (e.g., 'average price' or 'percentage ratio').",
        "Both combine multiple criteria in queries (e.g., 'free apps with >10,000 installs and rating \u22654.2' or 'rating + sentiment polarity + genre').",
        "Both include questions seeking ranked/top-N results (e.g., 'top 5 shopping apps' or 'most reviewed app').",
        "Both reference app pricing models (e.g., 'free' vs. paid) as a filter or metric.",
        "Both involve filtering or analysis based on sentiment (A: polarity scores, B: 'positive sentiment reviews').",
        "Both use explicit numerical thresholds (e.g., 'rating of 3.9', 'size <50M', 'sentiment polarity >0.5').",
        "Both request statistical comparisons (e.g., 'percentage ratio of positive sentiments' or 'average rating comparisons')."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Both datasets focus on querying app ratings, including average ratings and counts of apps with specific ratings (e.g., 4.0+ in B, 5.0 in A).",
        "Both use numerical thresholds to filter results (e.g., ratings >4.0 in B, sentiment polarity >0.5 in A).",
        "Both aggregate data using functions like average, count, total, and percentage (e.g., average rating in B, percentage ratio in A).",
        "Both query app categories/genres (e.g., 'GAME' in B, 'role playing game' in A).",
        "Both reference app metadata such as price (A) or free status (B) and install/review counts (e.g., 100M+ installs in A, 10K+ reviews in B).",
        "Both include queries about specific apps by name (e.g., 'Fun Cube 2' in B, 'Dragon Ball Legends' in A).",
        "Both involve sentiment analysis (A explicitly mentions polarity/subjectivity; B includes queries about positive reviews).",
        "Both request ranked or extreme values (e.g., 'top 5 shopping apps' in A, 'highest rating' in B).",
        "Both combine multiple criteria in queries (e.g., category + rating in B, genre + sentiment in A).",
        "Both target app attributes available in app store datasets (e.g., content ratings like 'Teen' in A, categories like 'FAMILY' in B)."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Both datasets focus on querying app ratings (e.g., 'rating of 5' in A, 'rating of 4.5 or higher' in B).",
        "Both involve filtering by app genres/categories (e.g., 'racing genre' in A, 'action genre' in B).",
        "Both require aggregating install counts (e.g., '5,000+ installs' in A, '1,000 installs' in B).",
        "Both include sentiment analysis metrics (e.g., 'sentiment polarity score' in A, 'average sentiment polarity' in B).",
        "Both ask for ranked/top-N results (e.g., 'top 3 genre' in A, 'top 5 most popular apps' in B).",
        "Both combine multiple filters (e.g., 'rating + translated review' in A, 'rating + installs + genre' in B).",
        "Both reference app metadata (e.g., 'current version' in A, 'apps updated since 2020' in B).",
        "Both calculate averages/percentages (e.g., 'average price' in A, 'average rating' in B).",
        "Both query free vs. paid app distinctions (e.g., 'free sports Apps' in A, 'most popular free applications' in B).",
        "Both use exact numerical thresholds (e.g., '3.9 rating' in A, '100,000 installs' in B)."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Both datasets involve queries about app ratings, including average ratings, specific rating values, and counts of apps with certain ratings.",
        "Questions in both datasets filter results using categories (e.g., 'Tools' in B) or genres (e.g., 'role-playing' in A) as criteria.",
        "Sentiment-related metrics (e.g., sentiment polarity scores in A, positive/negative sentiment reviews in B) are analyzed in queries.",
        "Aggregate functions like average, count, and sum are frequently used to derive insights (e.g., average price in A, average rating in B).",
        "Numerical thresholds (e.g., install counts, review counts, or rating values) are used to filter or segment app data in both datasets.",
        "Queries often combine multiple criteria (e.g., category + rating in B, installs + sentiment in A) to refine results.",
        "Top-N ranking queries (e.g., 'top 5 most reviewed apps' in B, 'top 3 genres' in A) are present in both datasets.",
        "Both datasets include questions about the total or percentage distribution of sentiment categories (e.g., neutral/negative comments in A, positive sentiment ratios in B).",
        "Specific apps are referenced by name in queries (e.g., 'Dragon Ball Legends' in A, 'Super Mario Bros.' in B).",
        "Review counts and their relationship to other metrics (e.g., ratings, installs) are a recurring theme in both datasets."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query average ratings of apps within specific categories/genres.",
        "Both datasets inquire about app-specific metrics such as ratings, reviews, or install counts.",
        "Both focus on filtering apps by categorical attributes (e.g., genre in A, category in B).",
        "Both include questions about user sentiment (e.g., sentiment polarity in A, positive reviews in B).",
        "Both use aggregate functions like average, count, and total for analysis.",
        "Both datasets ask for app details tied to specific attributes (e.g., translated reviews in A, size in B).",
        "Both involve filtering apps by install ranges (e.g., 5,000+ installs in A, free games with >4.0 rating in B).",
        "Both reference content ratings or target demographics (e.g., Teen in A, categories like FAMILY in B).",
        "Both include questions about individual apps (e.g., Dragon Ball Legends in A, HTC Weather in B).",
        "Both use comparative metrics (e.g., top/lowest-rated apps in A, highest-rated apps in B)."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Both datasets query app ratings (specific values, averages, or thresholds).",
        "Both filter apps by category/genre (e.g., \"racing genre\" in A, \"FAMILY category\" in B).",
        "Both include install/download metrics as criteria (e.g., \"5,000+ installs\" in A, \"1,000,000+ installs\" in B).",
        "Both reference specific apps by name (e.g., \"Dragon Ball Legends\" in A, \"Space Z\" in B).",
        "Both use aggregation functions like average, count, or sum (e.g., \"average price\" in A, \"average rating\" in B).",
        "Both involve content rating filters (e.g., \"Teen\" in A, \"teen content rating\" in B).",
        "Both combine multiple criteria (e.g., category + rating + installs) in queries.",
        "Both differentiate between free and paid apps as a filtering condition.",
        "Both include ranking-based queries (e.g., \"top 5\" in A, \"top-rated apps\" in B).",
        "Both require filtering by app characteristics like price, category, or content rating."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Both datasets focus on querying app ratings, including average ratings and specific rating thresholds.",
        "Both datasets include questions about app categories or genres (e.g., 'Entertainment', 'Action', 'role playing games').",
        "Both datasets analyze install counts/download numbers as a key metric.",
        "Both datasets contain queries about sentiment analysis (polarity scores in A, sentiment categories in B).",
        "Both datasets require identification of top-ranked apps based on various criteria (e.g., 'top 5 most reviewed', 'best selling app').",
        "Both datasets include specific references to named applications (e.g., 'Twitter', 'Dragon Ball Legends').",
        "Both datasets utilize numerical filters for queries (e.g., 'rating above 4.2', 'price less than 5 dollars').",
        "Both datasets examine installation thresholds (e.g., '10 million times', '5,000+ installs').",
        "Both datasets involve category-specific metric calculations (e.g., average rating per genre, sentiment scores per category).",
        "Both datasets incorporate analysis of user reviews through quantitative measures (translated reviews in A, positive review counts in B)."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Both datasets query app ratings (e.g., averages, thresholds, comparisons).",
        "Both involve filtering by app categories or genres (e.g., 'Games,' 'Tools,' 'Sports').",
        "Both include sentiment-related metrics (e.g., polarity scores, positive/negative classifications).",
        "Both require counting specific entities (e.g., apps, reviews, installs, sentiments).",
        "Both reference named apps directly (e.g., 'Instagram,' 'Dragon Ball Legends').",
        "Both use aggregate functions (e.g., AVG, MAX, COUNT) for analysis.",
        "Both ask for top-ranked apps (e.g., 'highest rating,' 'most reviews').",
        "Both apply numerical thresholds (e.g., ratings >4.0, installs >5,000+).",
        "Both generate ranked lists (e.g., 'top 5 apps,' 'top 3 genres').",
        "Both focus on quantitative app performance metrics (e.g., ratings, installs, reviews)."
      ]
    }
  },
  "diffs_synth_from_real": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes questions about student phases (e.g., 'Pre_Quals', 'Phase 0') not present in A.",
        "Dataset B explicitly references faculty departments (e.g., 'Faculty of Mathematics') as a filter, while A does not.",
        "Dataset B asks for the minimum and maximum values of student years in the program (e.g., 'min and max number of years'), whereas A focuses on specific year thresholds.",
        "Dataset B includes questions about non-professional courses, while A only references professional courses.",
        "Dataset B combines counts of multiple entity types in a single query (e.g., 'total number of professors and students'), which A avoids.",
        "Dataset B references students who 'have not completed their program yet,' a completion status not mentioned in A.",
        "Dataset B uses positional filters (e.g., 'people with a position in a department') absent in A.",
        "Dataset B explicitly asks for courses taught by faculty members as a distinct category, while A distinguishes faculty vs. non-faculty professors without grouping faculty members as a teaching category.",
        "Dataset B includes queries about the 'highest course level' as a standalone metric, while A focuses on comparative metrics (e.g., 'which course has more teachers').",
        "Dataset B references course names (e.g., 'names of the courses') instead of only IDs and levels as in A."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes questions about personal names (e.g., 'name of the student'), while A exclusively references numerical IDs without naming entities.",
        "Dataset B uses standardized course level labels (e.g., 'Level_500'), whereas A uses descriptive terms like 'basic,' 'medium,' or 'high-level' without explicit level codes.",
        "Dataset B contains repetitive queries about courses taught by specific professor IDs (e.g., multiple variations of 'courses taught by professor with ID X'), while A diversifies query structures.",
        "Dataset B lacks multi-criteria filtering (e.g., 'high-level or harder undergraduate courses taught by faculty members'), which is common in A.",
        "Dataset B includes a query about a person holding dual roles (student and professor), a scenario absent in A.",
        "Dataset B asks for the 'highest course level' (max aggregation), while A focuses on percentages and proportional distributions of course levels.",
        "Dataset B omits questions involving hierarchical academic role comparisons (e.g., 'faculty employee professors') prevalent in A.",
        "Dataset B does not include percentage-based calculations (e.g., 'percentage of high-level courses'), which are frequent in A.",
        "Dataset B features simpler aggregation targets (e.g., 'total number of students') compared to A's complex aggregations like 'top 3 professors teaching the most courses'.",
        "Dataset B lacks queries about temporal or program-year-based student-advisor relationships (e.g., 'students in the 5th year of their program'), a recurring theme in A."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Queries in dataset B include filtering based on professors' years of teaching experience (e.g., '5 or more years'), which is absent in A.",
        "Dataset B references specific professor positions or titles (e.g., 'full-time', 'Faculty_eme') beyond general faculty affiliation, unlike A.",
        "B includes requests for average calculations (e.g., 'average number of students per professor'), whereas A focuses on totals and percentages.",
        "Questions in B explicitly ask for student names (e.g., 'names of students'), while A only references numerical IDs.",
        "Dataset B mentions specific course names (e.g., 'Data Structures and Algorithms'), which A does not.",
        "B uses superlative terms (e.g., 'highest level undergraduate course') for criteria, whereas A uses comparative terms like 'high-level or harder'.",
        "Queries in B combine multiple attribute conditions (e.g., position and course count) in a single question, unlike A's simpler filters.",
        "Dataset B asks about students with multiple advisors (e.g., 'more than one advisor'), a criterion absent in A.",
        "B includes questions about courses taught exclusively by one professor (e.g., 'only one professor teaching it'), which A does not address.",
        "Temporal criteria in B are phrased as program phases (e.g., 'Phase 1'), while A uses years in the program (e.g., '5th year')."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B queries consistently use numeric course level thresholds (e.g., '500 or higher') while A uses qualitative descriptors (e.g., 'high-level or harder undergraduate')",
        "Dataset B focuses exclusively on professors rather than including broader terms like 'teachers' or 'faculty employees' found in A",
        "Dataset B contains repeated queries about program enrollment counts ('currently in the program') that don't appear in A",
        "Dataset B includes questions about temporal professor experience ('more than 5 years in the program') not present in A",
        "Dataset A contains ranking operations ('top 3 professors') while B focuses on binary thresholds ('more than one professor')",
        "Dataset B references student-to-student advising relationships ('advised by at least one other student') not found in A",
        "Dataset A specifies concrete year numbers ('5th year') while B uses phase-based program divisions ('phase 2')",
        "Dataset B queries about course enrollment numbers ('students enrolled in each course') which A never addresses",
        "Dataset A contains percentage calculations ('percentage of high-level courses') absent from B's aggregation queries",
        "Dataset B uses explicit level classifications ('Level_100', 'graduate level') where A uses relative difficulty descriptions ('basic or medium')"
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes queries asking for professor or student names (e.g., 'Professor Jane'), while A only uses numerical IDs for identification",
        "Queries in B sometimes reference professors by name ('Professor Smith') rather than only numerical IDs like in A",
        "B contains questions about personal attributes/status (e.g., 'has no position') not found in A's academic role-focused queries",
        "Dataset B includes simpler, single-attribute requests (e.g., 'What is the course level...') without aggregations or comparisons common in A",
        "B lacks complex percentage calculations and top-N rankings that appear frequently in A's queries",
        "Questions in B never use comparative phrases like 'more than' or range specifications (e.g., 'from 40 to 50') that are common in A",
        "B includes existence checks ('Which professor has advised a student?') without quantitative requirements that A consistently uses",
        "Dataset B queries never mention faculty employment status or faculty-specific filters that are prominent in A",
        "B contains no queries about academic year distributions or student cohort analysis that A frequently asks about",
        "Dataset B lacks multi-condition course level specifications (e.g., 'basic or medium undergraduate courses') used in A's queries"
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B questions use explicit 'courseLevel' attributes (e.g., Level_400) rather than qualitative descriptors like 'basic/medium/high-level' in Dataset A",
        "Dataset B includes temporal conditions about professor tenure (e.g., 'teaching for more than 2 years') absent in Dataset A",
        "Dataset B contains questions about professor experience thresholds (e.g., 'more than 0 years') not found in Dataset A",
        "Dataset B exclusively uses prefix-based ID notations like 'p_id' and 'professor_id' instead of Dataset A's simpler 'ID' or 'teacher no.' formats",
        "Dataset B lacks questions involving student-advisor relationships that are prominent in Dataset A",
        "Dataset B omits numerical aggregations (e.g., percentages, top-N rankings) present in Dataset A questions",
        "Dataset B questions never reference student years in programs or academic progression metrics seen in Dataset A",
        "Dataset B includes explicit references to person roles in IDs (e.g., 'professor 150') while Dataset A uses generic numerical identifiers",
        "Dataset B questions focus solely on professor-course relationships without Dataset A's additional faculty employment status filters",
        "Dataset B lacks conditional comparisons between entities (e.g., 'course no.16 vs course no.18') present in Dataset A"
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B includes queries that reference professors by name (e.g., 'professor John'), while Dataset A exclusively uses numeric IDs for entities.",
        "Dataset B introduces course-related phases (e.g., 'beginning phase') not present in Dataset A's queries.",
        "Dataset A explicitly involves advisor-student relationships (e.g., 'students being advised by advisors'), whereas Dataset B focuses more narrowly on professor-course relationships.",
        "Dataset B repeats similar numerical thresholds across questions (e.g., 'more than 5 years') with less variation than Dataset A's threshold usage.",
        "Dataset A includes percentage calculations (e.g., 'percentage of high-level courses'), while Dataset B does not require percentage-based operations.",
        "Dataset B contains queries about students teaching courses (e.g., 'students in their 2nd year of the program'), a concept absent in Dataset A.",
        "Dataset A explicitly requests ranked lists (e.g., 'top 3 professors'), while Dataset B typically asks for singular top performers (e.g., 'professor with the most courses').",
        "Dataset B directly references specific attribute names like 'courseLevel' in questions, whereas Dataset A embeds attribute logic implicitly in filters.",
        "Dataset A explicitly verifies faculty employment status (e.g., 'faculty employee professors'), while Dataset B infers status indirectly via program duration thresholds.",
        "Dataset B's queries often use simpler conditional logic (e.g., single filters like 'yearsInProgram > 3'), whereas Dataset A frequently combines multiple conditions with AND/OR operators."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B questions explicitly reference database schema elements (e.g., 'courseLevel', 'p_id', 'inPhase') while Dataset A uses natural language terms without schema-specific column names.",
        "Dataset B includes explicit requests for SQL code (e.g., 'Write a SQLite query') whereas Dataset A only poses analytical questions without code generation requirements.",
        "Dataset A frequently combines multiple conditions in filters (e.g., 'high-level or harder undergraduate courses') while B typically uses single criteria per query.",
        "Dataset A contains queries requiring percentage calculations and top-N rankings (e.g., 'top 3 professors') which are absent in Dataset B's simpler count operations.",
        "Dataset B shows repetitive query patterns (e.g., multiple variations of 'courses taught by professors') while Dataset A demonstrates more unique question structures.",
        "Dataset A specifies academic role distinctions like 'faculty employee professors' whereas B uses undifferentiated 'professor' references.",
        "Dataset A includes comparative queries (e.g., 'course no.16 or course no.18') while B lacks direct comparison operations.",
        "Dataset A utilizes ID ranges (e.g., 'person IDs from 40 to 50') whereas B only queries single specific IDs.",
        "Dataset B references database structure elements (e.g., 'person table') while A remains abstracted from schema implementation details.",
        "Dataset A emphasizes complex academic relationships (e.g., 'advised students in 5th year') while B's student queries focus on simpler existence checks."
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries about user payment method status (e.g., 'users who have a payment method') while Dataset A focuses exclusively on subscription types (paying/trial)",
        "Dataset B contains explicit requests for sorting results in descending order of metrics (e.g., 'descending order of followers') while Dataset A only specifies top-ranked items without explicit sort directives",
        "Dataset B queries about list membership contradictions (e.g., 'lists that a user created and is not a follower of') which doesn't appear in Dataset A",
        "Dataset B contains substring matching requirements for list titles (e.g., 'title contains the word 's'') while Dataset A uses exact title matches",
        "Dataset A requires retrieval of specific media URLs (profile images, cover images, rating URLs) while Dataset B contains no media URL requests",
        "Dataset B calculates percentages based on payment method status combined with other attributes, while Dataset A calculates percentages based purely on rating scores/subscription status",
        "Dataset B includes queries about average follower counts (e.g., 'average number of followers') while Dataset A only uses absolute follower count thresholds",
        "Dataset A asks for movie release years and specific release date information while Dataset B's temporal constraints focus exclusively on creation/update timestamps",
        "Dataset B contains theme-based movie categorization through list titles (e.g., 'ballet movies') while Dataset A references lists only by exact titles or creator attributes",
        "Dataset A includes requests for list descriptions and cover images while Dataset B focuses exclusively on list metadata (titles, followers, creation dates)"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B's questions exclusively request single attributes (e.g., title OR year) per query, while A often combines multiple attributes (e.g., average rating AND release year) in a single question",
        "Dataset B never references user subscription statuses (e.g., paying subscriber, trial eligibility) in filters, whereas A consistently uses these as conditional criteria",
        "Dataset B only uses year-based temporal filters (e.g., 'released after 2000'), while A frequently specifies exact dates (e.g., '4/19/2020') in conditions",
        "Dataset B never requests percentage calculations, while A regularly includes percentage-based metrics (e.g., 'percentage of users gave \"5\"')",
        "Dataset B sometimes references explicit database table names (e.g., 'lists' table), while A never mentions underlying database structures",
        "Dataset B's URL requests only target movie resources, while A consistently requests user-specific URLs (profile images, rating pages with likes)",
        "Dataset B never asks about list descriptions or user profile images, while A frequently queries these metadata elements",
        "Dataset B uses single-layer conditional filters (e.g., '> 100 followers'), while A consistently combines multiple conditions (e.g., 'paying subscriber AND eligible trial user AND score \u2264 2')",
        "Dataset B never references user interactions like comment counts or rating likes, while A regularly includes these social engagement metrics",
        "Dataset B shows structural repetition (e.g., multiple variants of 'highest rating score' queries), while A maintains unique question structures across samples"
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B consistently references 'payment method' as a user attribute in filters, whereas Dataset A distinguishes between 'paying subscriber' and 'trial eligible' statuses explicitly.",
        "Dataset B includes queries requiring substring matches in list titles (e.g., 'title contains the word Avengers'), while Dataset A does not use partial string matching in titles.",
        "Dataset B explicitly requests list identifiers (e.g., 'indicate the list_id') in results, whereas Dataset A focuses on list titles or attributes without requiring IDs.",
        "Dataset B uses decade-based temporal filters for movie release years (e.g., 'released in the 90s'), while Dataset A specifies exact years or dates.",
        "Dataset B frequently asks for ranked results with explicit numeric limits (e.g., 'top 3', 'top 5'), whereas Dataset A uses comparative terms like 'most popular' or 'longest period' without fixed thresholds.",
        "Dataset B includes nested aggregation conditions (e.g., 'average number of movies in lists that also have a rating score of 5'), while Dataset A focuses on single-layer aggregations like counts or averages.",
        "Dataset B lacks queries about user-generated media URLs (e.g., profile images, rating URLs), which are prominent in Dataset A.",
        "Dataset B requires director-specific qualifications involving quantitative career metrics (e.g., 'directed at least 10 movies between 1960\u20131985'), whereas Dataset A uses simpler director-name filters.",
        "Dataset B omits percentage-based calculations (e.g., 'percentage of users gave 5'), which appear in Dataset A.",
        "Dataset B does not ask for temporal durations (e.g., 'how long it has not been updated'), focusing instead on binary temporal checks (e.g., 'updated in the last month')."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B questions focus on overall database metrics (e.g., total movies, unique directors) rather than user-specific attributes like subscription status or trial eligibility.",
        "Dataset B includes questions about movie genres and categories, which are absent in Dataset A.",
        "Dataset B uses explicit numerical thresholds (e.g., 'greater than 4.5', '8 out of 10') without contextual conditions (e.g., user eligibility), unlike Dataset A.",
        "Dataset B frequently asks for counts of unique entities (e.g., unique users, unique directors) not emphasized in Dataset A.",
        "Dataset B lacks references to URLs (e.g., profile images, rating pages) present in Dataset A.",
        "Dataset B often requests 'top N' rankings (e.g., 'top 5 most popular movies') without requiring comparisons between specific entities, unlike Dataset A's direct comparisons (e.g., 'how much higher is X than Y').",
        "Dataset B includes questions about database schema (e.g., 'how many different genres') absent in Dataset A.",
        "Dataset B omits time-based user statuses (e.g., 'eligible for trial when he rated') present in Dataset A's conditional filters.",
        "Dataset B simplifies aggregation to basic counts/averages (e.g., 'how many movies released in 2020'), whereas Dataset A combines aggregations with user metadata (e.g., 'indicate whether the user was a paying subscriber').",
        "Dataset B does not require multi-part answers (e.g., 'state how long it has not been updated') common in Dataset A."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries focus on single-attribute retrieval (e.g., title/ID only) without multi-part answers, unlike A's compound requests",
        "Dataset B lacks questions involving URLs/profiles/images/descriptions present in A's metadata requirements",
        "Dataset B contains no queries about user subscription status (e.g., paying/trial eligibility) prevalent in A",
        "Dataset B questions never request percentage calculations seen in A's metrics (e.g., 'percentage of users gave 5')",
        "Dataset B excludes temporal duration comparisons (e.g., 'how long since update') that A frequently requires",
        "Dataset B has no queries combining multiple attributes in one result (e.g., 'average score AND release year' as seen in A)",
        "Dataset B never references relationships between user-generated content types (e.g., 'lists created by raters') present in A",
        "Dataset B questions are structurally simpler, avoiding A's nested conditions (e.g., 'rated X while being Y subscriber')",
        "Dataset B lacks hierarchical ranking requests (e.g., 'top 3 movies by comment counts') found in A's queries",
        "Dataset B shows no examples of verification requests (e.g., 'was eligible? indicate followers') present in A"
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B queries more frequently request single attributes (e.g., titles) without requiring multi-part answers, whereas Dataset A often demands combined results (e.g., averages paired with release years).",
        "Dataset B explicitly references database structures like 'ratings table' or 'lists table' in questions, while Dataset A avoids direct table references.",
        "Dataset A includes questions about user-generated media assets (e.g., profile images, cover images, URLs), which are absent in Dataset B.",
        "Dataset B uses relative time constraints (e.g., 'last year'), while Dataset A relies on absolute dates (e.g., '4/19/2020').",
        "Dataset B focuses on 'top N' formatted results (e.g., 'top 5 movies'), whereas Dataset A uses rankings (e.g., 'most popular') without explicit numbered rankings.",
        "Dataset A combines multiple conditional attributes (e.g., 'paying subscriber AND trial eligible') in filters, while Dataset B uses simpler conditions (e.g., 'has a payment method').",
        "Dataset B includes theme-based queries (e.g., 'horror movies', 'Christianity as their theme'), whereas Dataset A references genres only in basic metadata contexts.",
        "Dataset A requires indicating user/subscriber status alongside results (e.g., 'Indicate whether the user was a paying subscriber'), which Dataset B omits.",
        "Dataset B asks for positional data (e.g., 'first movie in the list'), a pattern absent in Dataset A.",
        "Dataset A frequently uses comparative phrasing (e.g., 'how much higher', 'difference between'), while Dataset B focuses on absolute extremes (e.g., 'highest', 'lowest')."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Queries in B explicitly reference database table names (e.g., 'lists', 'Ratings'), while A avoids table-specific terminology.",
        "B uses generic user identifiers like 'user1' instead of numeric user IDs (e.g., 'user 4208563') as seen in A.",
        "B repeats identical questions (e.g., 'highest rating') verbatim, whereas A avoids redundancy in phrasing.",
        "B includes structural queries about the database itself (e.g., 'list titles of all the lists in the database'), which A never does.",
        "B\u2019s top-N requests (e.g., 'top 3') lack contextual filters like time ranges or user statuses that A consistently includes.",
        "B focuses on table-specific existence checks (e.g., 'in the lists_users table') rather than A\u2019s cross-entity joins (e.g., 'movies rated by list creators').",
        "B omits requests for URLs or metadata (e.g., profile images, rating pages) that are prevalent in A.",
        "B employs single-condition filters (e.g., 'has a trial subscription') without combining multiple criteria like A\u2019s 'paying subscribers with profile pics'.",
        "B\u2019s aggregated metrics (e.g., 'average rating of Horror genre') lack granularity compared to A\u2019s director/year-specific aggregations (e.g., 'movies directed by Christopher Nolan').",
        "B uses broad time constraints (e.g., 'after 2021-01-01') without A\u2019s precise timestamps (e.g., 'on 4/19/2020') or duration calculations (e.g., 'how long it has not been updated')."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Queries in dataset B frequently repeat identical or near-identical questions (e.g., 'What is the title of the movie with the highest rating score?' appears 5 times), while dataset A has unique phrasing for every query.",
        "Dataset B never references URLs (profile images, cover images, or rating URLs), whereas dataset A explicitly requests URLs in multiple queries.",
        "Dataset B doesn't include questions about user-generated content interactions like comment counts or likes on ratings, which are present in dataset A.",
        "Dataset B lacks multi-part questions requiring simultaneous answers (e.g., 'Indicate how many followers...'), while dataset A often combines multiple requirements in a single query.",
        "Dataset B uses generic identifiers like 'specific user' or 'a user with a specific user_id' instead of concrete IDs (e.g., 'user 4208563') as seen in dataset A.",
        "Dataset B doesn't include time-granular date constraints (e.g., 'on 4/19/2020'), only year-based filters, unlike dataset A's precise date references.",
        "Dataset B never asks about trial eligibility status during actions (e.g., 'eligible for trial when he rated'), a recurring theme in dataset A.",
        "Dataset B excludes questions about list descriptions or metadata beyond titles/followers, while dataset A explicitly requests descriptions and update timelines.",
        "Dataset B doesn't require percentage calculations (e.g., 'percentage of users gave \"5\"'), which are present in dataset A.",
        "Dataset B lacks references to movie list creators' subscriber status during list creation, a filtering criterion used in dataset A."
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B uses uppercase category names (e.g., 'FAMILY', 'TOOLS') while A uses lowercase genre descriptors (e.g., 'dating application', 'racing genre')",
        "B explicitly references app size metrics (e.g., 'size <50M') while A never mentions physical app size characteristics",
        "B uses the term 'type' to specify pricing models (e.g., 'free type') while A simply uses 'free' as a filter without categorical labeling",
        "B queries about content ratings (e.g., 'content rating of Everyone') while A references age groups through informal descriptors (e.g., 'Teen content rating')",
        "B employs more varied install count thresholds (10,000 vs 1M vs 10M) while A primarily uses 5,000+ and 100M+ install filters",
        "B includes percentage calculations based on install counts (e.g., 'percentage of games with 100k+ installs') while A calculates percentages based on sentiment types and user demographics",
        "B never requests translated reviews or specific user comments while A frequently asks for translated reviews and comment analysis",
        "B's sentiment analysis focuses on binary positive/negative classification ('positive sentiment reviews') while A uses continuous polarity scores and mentions neutral comments",
        "B structures queries around formal category hierarchies (e.g., 'HOUSE_AND_HOME') while A uses informal genre groupings (e.g., 'role playing game genre')",
        "B combines content ratings with technical specifications (e.g., 'content rating + install count + size') while A combines sentiment metrics with user demographics and specific app features"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B does not reference sentiment polarity or subjectivity scores in queries, whereas Dataset A explicitly includes them.",
        "Dataset B does not involve translated reviews in its questions, while Dataset A frequently requests translated reviews.",
        "Dataset B does not query app version information, unlike Dataset A, which includes version-specific criteria.",
        "Dataset B uses review counts (e.g., 10K+ reviews) as thresholds, while Dataset A uses install counts (e.g., 100M+ installs).",
        "Dataset B focuses on free status (e.g., 'Free' apps), whereas Dataset A queries price attributes (e.g., 'most expensive app').",
        "Dataset B does not reference content ratings (e.g., 'Teen'), while Dataset A includes age-group targeting and content ratings.",
        "Dataset B does not explicitly request counts of negative or neutral sentiments, unlike Dataset A, which directly queries these metrics.",
        "Dataset B does not calculate percentage ratios between sentiment categories (e.g., positive vs. negative), while Dataset A includes such computations.",
        "Dataset B includes queries about apps with 'None' ratings (e.g., missing values), which are absent in Dataset A.",
        "Dataset B emphasizes category-wide averages and counts (e.g., 'average rating of FAMILY category'), whereas Dataset A combines ratings with sentiment analysis in more granular filters (e.g., 'sentiment polarity >0.5')."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B queries aggregate metrics across entire categories (e.g., 'average rating of apps from the Tools category') while A focuses on individual apps or narrow subsets.",
        "Dataset B uses higher install count thresholds (e.g., '10 million installs') compared to A's lower thresholds (e.g., '5,000+ installs').",
        "Dataset B combines three or more filters in a single query (e.g., 'rating + installs + genre + free status') whereas A typically combines two filters.",
        "Dataset B references app size (e.g., 'more than 100 MB') and update recency (e.g., 'updated since 2020'), which A does not.",
        "Dataset B lacks queries about neutral sentiment counts or percentage ratios between sentiment types (e.g., A's 'percentage ratio of positive sentiment reviews').",
        "Dataset B does not include demographic-specific filters (e.g., A's 'Teen' content rating targeting) in its questions.",
        "Dataset B prioritizes popularity metrics (e.g., 'most downloaded apps') as primary ranking criteria, while A emphasizes sentiment polarity scores.",
        "Dataset B uses comparative phrases like 'higher than 4.5' for ratings, whereas A uses exact values (e.g., '3.9 rating').",
        "Dataset B explicitly queries app categories as entities (e.g., 'top 5 categories with the highest average rating'), unlike A\u2019s genre-specific app lists.",
        "Dataset B omits granular sentiment analysis (e.g., A\u2019s 'neutral comments from weather apps') and focuses on average sentiment polarity instead."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B queries focus on SQL query generation (e.g., 'Write a SQL query...') while A uses natural language questions without explicit SQL syntax",
        "B emphasizes category-based filtering (e.g., 'Tools', 'Family') whereas A uses genre-based categorization (e.g., 'role-playing', 'racing')",
        "A includes explicit requests for translated reviews while B never mentions translation needs",
        "B consistently uses simple rating thresholds (>4, >4.5) while A employs more varied numerical filters (5,000+ installs, 75,000,000+ reviews)",
        "A contains questions about content rating segments ('Teen') that are absent in B",
        "B queries frequently combine category with review count thresholds (e.g., 'more than 100,000 reviews') while A combines install counts with sentiment metrics",
        "A includes questions about app versions and age targeting ('current version', 'age group') which B never references",
        "B focuses more on basic aggregations (average rating counts) while A requests complex sentiment calculations (polarity scores, percentage ratios between sentiments)",
        "A asks for specific review text analysis ('List all negative comments') while B only deals with quantitative sentiment counts",
        "B shows repetitive patterns focusing on category+rating combinations, whereas A demonstrates more diverse metric combinations (price+sentiment, installs+age groups)"
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B does not include questions about sentiment polarity or subjectivity scores, which are present in all Dataset A samples.",
        "Dataset B questions do not reference translated user reviews, a feature present in multiple Dataset A samples.",
        "Dataset B includes explicit questions about app size (e.g., 'what is the size'), while Dataset A does not mention size metrics.",
        "Dataset B contains queries about total app count in the Play Store ('How many apps are available'), which never appears in Dataset A questions.",
        "Dataset B questions focus exclusively on positive review counts rather than percentage ratios between positive/negative sentiments seen in Dataset A.",
        "Dataset B lacks questions combining multiple sentiment metrics with demographic filters (e.g., 'Teen content rating apps') present in Dataset A.",
        "Dataset B uses standardized category names in all caps (e.g., 'FAMILY', 'BUSINESS') rather than lowercase genre names seen in Dataset A.",
        "Dataset B includes questions about minimum download thresholds ('minimum number of downloads') not present in Dataset A's install range filters.",
        "Dataset B does not contain questions about app versions or technical specifications ('current version') that appear in Dataset A samples.",
        "Dataset B questions aggregate metrics across entire categories more frequently than Dataset A, which focuses on genre-specific analysis."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B focuses exclusively on average rating calculations, while A includes additional metrics like sentiment polarity scores, subjectivity, and review analyses.",
        "Dataset B uses uppercase category names (e.g., 'FAMILY', 'MEDICAL') consistently, while A uses lowercase genre labels (e.g., 'racing genre').",
        "Dataset B specifies 'Play Store' in most queries, while A never mentions a specific app store platform.",
        "Dataset B queries frequently repeat requests for average ratings across multiple categories, while A maintains more diverse analytical objectives per query.",
        "Dataset B utilizes numeric rating thresholds (e.g., '3.0', '4.5') as direct filters, while A combines numeric ratings with qualitative sentiment descriptors (e.g., 'users dislike this app').",
        "Dataset B references larger installation thresholds (e.g., '1,000,000+ installs'), while A uses smaller thresholds like '5,000+ installs'.",
        "Dataset B includes specific category types not present in A (e.g., 'AUTO_AND_VEHICLES', 'BBW Dating'), while A focuses on genres like dating, education, and role-playing games.",
        "Dataset B queries never mention translated reviews, sentiment polarity scores, or review commentary analysis present in A's samples.",
        "Dataset B employs standardized category naming conventions (e.g., 'NEWS_AND_MAGAZINES') with underscores, while A uses plain language genre labels.",
        "Dataset B lacks percentage ratio calculations and sentiment comparisons (positive/negative/neutral) that are prevalent in A's queries."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B queries consistently reference 'PlayStore' as the platform, while Dataset A does not specify a platform.",
        "Dataset B focuses on aggregate average ratings across entire categories (e.g., 'average rating of all apps in Entertainment'), while A frequently examines individual app metrics alongside category analysis.",
        "Dataset B uses the term 'Android apps' specifically in queries, while A makes no platform-specific references.",
        "Dataset A contains queries about app versions and current versions (e.g., 'indicate the current version'), while B never mentions version information.",
        "Dataset A includes percentage-based ratio comparisons between sentiment types (positive/negative/neutral), while B only references sentiment counts/categories without ratios.",
        "Dataset A explicitly asks for content rating demographics (e.g., 'Teen' targeted apps), while B never references age groups or content ratings.",
        "Dataset A requires analysis of sentiment subjectivity scores and polarity scores as distinct metrics, while B only uses broad sentiment categories (positive/negative).",
        "Dataset A queries frequently request translated user reviews, while B never mentions translation of reviews.",
        "Dataset B emphasizes 'popularity' as a metric through install counts and review counts, while A focuses more on sentiment polarity correlations with other metrics.",
        "Dataset A contains queries about 'no comment reviews' and neutral sentiment analysis, while B only references positive/negative sentiment counts without neutral categorization."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B queries do not request translated reviews or comments, while Dataset A frequently asks for translated content (e.g., 'state the translated review').",
        "Dataset B focuses on simpler aggregations (e.g., 'average rating') without combining multiple metrics in a single query, unlike Dataset A which often combines metrics like 'average price and sentiment polarity'.",
        "Dataset B lacks complex conditional sentiment filtering (e.g., 'for people who dislikes the app pretty much') seen in Dataset A, using broader terms like 'positive sentiment' instead.",
        "Dataset B does not reference demographic attributes (e.g., 'Teen' content ratings, age groups) present in Dataset A queries.",
        "Dataset B queries rarely require multiple distinct outputs per question (e.g., 'rating and how many users dislike'), focusing on single-value results like counts or averages.",
        "Dataset B omits app-specific metadata such as 'current version' or pricing status (free/paid) included in Dataset A questions.",
        "Dataset B does not use percentage ratios (e.g., 'percentage ratio of positive sentiment reviews') common in Dataset A.",
        "Dataset B avoids negative extremes (e.g., 'worst rating') and focuses on positive metrics like 'highest rating', unlike Dataset A.",
        "Dataset B does not reference app install counts (e.g., '5,000+ installs') present in Dataset A queries.",
        "Dataset B uses repetitive, structurally identical questions (e.g., multiple variations of 'average rating in X category'), whereas Dataset A employs more varied phrasing and specificity."
      ]
    }
  },
  "diffs_real_from_synth": {
    "computer_student": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries asking for percentage-based aggregations (e.g., 'Calculate the percentage of high-level undergraduate course'), while A focuses only on absolute counts.",
        "Dataset B explicitly requires comparisons between two specific entities (e.g., 'course no.16 or course no.18'), whereas A uses general comparative phrases like 'which course has more teachers' without direct entity pairs.",
        "Dataset B uses numeric ranges in queries (e.g., 'person IDs from 40 to 50'), while A references individual IDs without ranges.",
        "Dataset B includes questions about majority distributions (e.g., 'which students\u2019 year...do the advisors advise the majority of'), a concept absent in A.",
        "Dataset B explicitly asks for lists combining multiple attributes in a single response (e.g., 'course level and list of person IDs'), while A typically requests singular attributes or counts.",
        "Dataset B contains queries about advisor-to-student ratios per program year (e.g., 'how many advisors are in charge of advising all the students in 1st year'), a granularity not seen in A.",
        "Dataset B references 'employing professor in faculty' as a distinct filter in advising contexts, whereas A only uses faculty status as a standalone filter.",
        "Dataset B includes explicit ranking requests (e.g., 'top 3 professors'), while A uses phrases like 'which faculty member has taught the most courses' without specifying ranking thresholds.",
        "Dataset B uses compound threshold conditions (e.g., 'no more than two high-level or harder courses'), whereas A applies simpler thresholds like 'more than 2 years'.",
        "Dataset B explicitly queries about teaching/advising overlap (e.g., 'teachers who have advised more than 4 others to teach'), while A separates teaching and advising constraints."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes questions requiring percentage calculations (e.g., 'Calculate the percentage of high-level undergraduate course') while A focuses only on absolute counts",
        "Dataset B explicitly queries about faculty employment status distinctions (e.g., 'Between the faculty employee professors...') whereas A does not mention faculty employment categories",
        "Dataset B contains comparative majority queries (e.g., 'which students' year... do the advisors advise the majority of') absent in A's samples",
        "Dataset B uses course difficulty descriptors ('basic/medium/high-level/harder undergraduate courses') instead of A's numbered level system ('Level_500')",
        "Dataset B includes explicit top-N ranking requests (e.g., 'top 3 professors that teaches the most courses') not found in A",
        "Dataset B requires multi-attribute listing in responses (e.g., 'List the course IDs and levels...') while A typically requests single attributes",
        "Dataset B contains questions about advisor distribution patterns across student years (e.g., 'advise the majority of') that A's simpler advising queries lack",
        "Dataset B explicitly compares specific course pairs (e.g., 'course no.16 or course no.18') using numeric identifiers, unlike A's general course comparisons",
        "Dataset B includes professional course categorization ('professional courses') not present in A's level-only course distinctions",
        "Dataset B queries about teaching/advising relationships between multiple role hierarchies simultaneously (e.g., 'advised student IDs and IDs of employing professor') while A handles these relationships separately"
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Queries in B explicitly ask for percentages (e.g., 'Calculate the percentage of high-level undergraduate course'), while A does not involve percentage calculations.",
        "B includes requests for ranked results (e.g., 'top 3 professors'), whereas A focuses on absolute counts or comparisons without ranking.",
        "B uses phrases like 'no.' (e.g., 'course no.11', 'teacher no.79') to reference numerical identifiers, while A uses 'ID' (e.g., 'professor id') consistently.",
        "B includes questions about majority distributions (e.g., 'which students' year...do the advisors advise the majority of?'), which A lacks.",
        "B explicitly references 'professional courses' as a category, while A does not mention this classification.",
        "B uses terms like 'in charge of advising' and 'employing professor in faculty' to specify employment roles, whereas A uses broader phrases like 'faculty affiliated position'.",
        "B includes queries with combined numerical ranges (e.g., 'person IDs from 40 to 50'), while A uses inequality-based ranges (e.g., 'greater than 10 and less than or equal to 15').",
        "B asks for direct comparisons between specific entities (e.g., 'course no.16 or course no.18'), while A focuses on general comparative quantities (e.g., 'which course has more teachers').",
        "B includes temporal criteria with extreme values (e.g., '12th years of program'), whereas A uses more common temporal ranges (e.g., '5th year').",
        "B uses verbs like 'Describe' to request attribute listings (e.g., 'Describe the course level and list of person IDs'), while A uses simpler phrasing like 'list' or 'state'."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B includes queries about ranking or top N results (e.g., 'top 3 professors'), while A does not.",
        "Dataset B explicitly references faculty employment status distinctions (e.g., 'faculty employees') in filtering criteria, whereas A uses broader role-based filters.",
        "Dataset B contains queries requiring combined numerical ranges (e.g., 'person IDs from 40 to 50'), while A uses only specific IDs or categorical filters.",
        "Dataset B uses the term 'teachers' interchangeably with professors (e.g., 'teacher no.79'), whereas A exclusively uses 'professors' for teaching roles.",
        "Dataset B includes queries about majority/plurality analysis (e.g., 'advisors advise the majority of'), while A focuses on simple counts.",
        "Dataset B explicitly references non-faculty professors (e.g., 'professor who is not a faculty member') as a distinct category, unlike A.",
        "Dataset B contains queries with compound output requirements (e.g., 'course level and list of person IDs'), while A requests singular data points per query.",
        "Dataset B uses more granular course level classifications (e.g., 'high-level or harder undergraduate courses'), whereas A uses simpler categorizations like 'graduate level'.",
        "Dataset B includes queries about advisor distribution patterns (e.g., 'advisors in charge of advising all students in 1st year'), while A focuses on basic advisor-student relationships.",
        "Dataset B employs threshold-based comparisons with explicit inequality constraints (e.g., 'no more than two'), whereas A uses simpler binary thresholds like 'more than one'."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes queries requiring comparative analysis (e.g., 'course no.16 or course no.18') while A does not.",
        "Dataset B contains requests for percentage calculations (e.g., 'Calculate the percentage of high-level undergraduate course') absent in A.",
        "Dataset B explicitly uses course difficulty categories (e.g., 'basic/medium/high-level') as filters, while A only references generic course levels.",
        "Dataset B includes multi-attribute return requirements (e.g., 'List the course IDs and levels') whereas A typically requests single attributes.",
        "Dataset B features queries about majority/distribution analysis (e.g., 'majority of' year groups) not present in A.",
        "Dataset B requires ranking/limiting results (e.g., 'top 3 professors') while A focuses on absolute counts/identifications.",
        "Dataset B uses numeric ranges in filters (e.g., 'person IDs from 40 to 50') whereas A uses only specific individual IDs.",
        "Dataset B includes explicit faculty employment status checks (e.g., 'professor who is not a faculty member') not emphasized in A.",
        "Dataset B contains multi-part conditional aggregates (e.g., 'teaches no more than two high-level... courses') while A uses simpler count conditions.",
        "Dataset B asks for relationship mappings between entities (e.g., 'IDs of employing professor in faculty') as combined outputs, unlike A's singular focus."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes questions requiring aggregation (e.g., counts, percentages, totals), while A focuses on direct retrieval without calculations.",
        "Dataset B introduces advisor-student relationships (e.g., advisor IDs, student years in programs), whereas A does not reference advisors or student advising.",
        "Dataset B contains comparative queries (e.g., 'which course has more teachers'), while A lacks explicit comparisons between entities.",
        "Dataset B asks for ranked or top-N results (e.g., 'top 3 professors'), whereas A only retrieves unordered lists or single entities.",
        "Dataset B includes queries about faculty employment status (e.g., 'faculty employees'), while A does not categorize professors by employment type.",
        "Dataset B references student years in academic programs (e.g., '5th year', '12th years') as a distinct attribute, which A does not mention.",
        "Dataset B uses multi-criteria filters (e.g., 'basic or medium undergraduate courses taught by faculty'), whereas A typically filters by single attributes like course level or ID.",
        "Dataset B explicitly requests numerical outputs (e.g., 'how many', 'percentage'), while A primarily seeks entity identifiers or categorical values.",
        "Dataset B includes range-based queries (e.g., 'person IDs from 40 to 50'), while A uses only exact numerical matches for IDs.",
        "Dataset B involves hierarchical relationships (e.g., advisors \u2192 students \u2192 professors), whereas A focuses on flat professor-course or professor-level relationships."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B includes queries about the distribution of students across academic years (e.g., 'majority of students' year', '5th year students') while A focuses on professors' program duration thresholds.",
        "Dataset B explicitly asks for comparative analysis between entities (e.g., 'course no.16 or course no.18') whereas A focuses on absolute thresholds and rankings.",
        "Dataset B contains queries about hierarchical relationships (advisors\u2192students\u2192year in program) while A focuses on direct professor\u2192course relationships.",
        "Dataset B requires percentage calculations ('Calculate the percentage of high-level undergraduate course') whereas A only uses absolute counts.",
        "Dataset B includes queries about faculty employment status in combination with course levels ('faculty employees who teach basic courses'), while A verifies employment status independently.",
        "Dataset B uses plural entity groupings ('more than 4 people', 'taught by more than 4 people') where A focuses on individual professor thresholds.",
        "Dataset B explicitly references ID ranges ('person IDs from 40 to 50') while A queries specific single IDs or attributes.",
        "Dataset B asks for composite results (e.g., 'IDs and years in program', 'course IDs and levels') where A typically requests singular attributes.",
        "Dataset B includes meta-structural queries about teaching/advising allocation ('advisors in charge of all 1st year students'), absent in A.",
        "Dataset B uses broader terminology ('people', 'teachers', 'person IDs') where A consistently uses 'professor' as the entity type."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Queries in B frequently involve multi-part questions requiring combined results (e.g., 'List the professors who teach the course' alongside course levels).",
        "B includes explicit requests for ranking/ordering (e.g., 'top 3 professors that teaches the most courses').",
        "B contains queries with explicit percentage calculations (e.g., 'Calculate the percentage of high-level undergraduate course').",
        "B uses comparative phrases (e.g., 'more teachers, course no.16 or course no.18') not seen in A.",
        "B specifies numerical ranges in filters (e.g., 'person IDs from 40 to 50') instead of single IDs.",
        "B includes queries about 'faculty employee professors' as a distinct subcategory of professors.",
        "B requires filtering courses by compound difficulty levels (e.g., 'high-level or harder undergraduate courses').",
        "B asks for cross-referencing advisor-student relationships with teaching activities (e.g., 'advisor\u2019s courses taught').",
        "B explicitly requests enumeration of IDs alongside totals (e.g., 'List out all the course id').",
        "B includes conditional aggregates with role-based thresholds (e.g., 'advised more than 4 others to teach')."
      ]
    },
    "movie_platform": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes queries requesting URLs (profile images, rating pages) which are absent in Dataset A",
        "Dataset B contains questions about movie release years, whereas Dataset A does not reference release dates",
        "Dataset B asks for cover images/profile images (e.g. user profile or list cover art) which are never mentioned in Dataset A",
        "Dataset B requires retrieval of list/movie descriptions (e.g. 'description for the movie list') while Dataset A never references descriptions",
        "Dataset B includes questions about comment counts/interaction (e.g. 'number comments related to the critic') which are absent in Dataset A",
        "Dataset B specifies temporal eligibility states (e.g. 'eligible for trial when he rated') while Dataset A only uses static subscription statuses",
        "Dataset B calculates time durations since creation/update (e.g. 'how long the list has been created') whereas Dataset A only uses absolute timestamps",
        "Dataset B queries popularity metrics (e.g. 'average popularity', 'highest number of 5 rating scores') beyond simple averages/counts used in Dataset A",
        "Dataset B requires indication of user status during specific actions (e.g. 'was a paying subscriber when he created') while Dataset A uses current status only",
        "Dataset B references social engagement metrics (e.g. 'received 20 likes') that are never mentioned in Dataset A"
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Queries in dataset B require comparative analysis between specific entities (e.g., comparing average ratings of two movies), whereas dataset A focuses on singular entity metrics.",
        "Dataset B includes questions about user subscription status or trial eligibility (e.g., 'paying subscriber,' 'eligible for trial'), which are absent in dataset A.",
        "Dataset B explicitly requests percentage calculations (e.g., 'percentage of users gave \"5\"'), while dataset A uses only absolute counts or averages.",
        "Queries in dataset B reference user-generated content metadata like profile images, cover images, or rating-specific URLs with likes (e.g., 'URL to the rating ... that received 20 likes'), whereas dataset A references generic URLs.",
        "Dataset B involves temporal calculations (e.g., 'how long the list has been created,' 'how long it has not been updated'), whereas dataset A only retrieves timestamps without duration-based computations.",
        "Dataset B includes questions about list or movie descriptions (e.g., 'What's the description for the movie list'), which are not queried in dataset A.",
        "Queries in dataset B require conditional dependencies between entities (e.g., movies rated by users who created a specific list), while dataset A filters entities independently.",
        "Dataset B explicitly asks for user interaction metrics like comments and likes on ratings (e.g., 'movies with the most ratings,' 'critic received the highest amount of likes'), which are absent in dataset A.",
        "Dataset B includes multi-part answers (e.g., 'Indicate whether the user was a paying subscriber') or boolean responses (e.g., 'Was the user ... eligible for trial?'), whereas dataset A requests single-value outputs.",
        "Dataset B queries combine aggregation with nested conditions (e.g., 'average popularity of each movie by a director and which has the highest 5-rating counts'), while dataset A uses simpler aggregations (e.g., total, highest, average)."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B includes queries requesting URLs (e.g., user profile images, rating URLs), while A does not reference URLs.",
        "Dataset B explicitly asks for media assets like cover images/profile images (e.g., 'cover image of user'), absent in A.",
        "Dataset B requires indicating subscription status during list creation (e.g., 'was a paying subscriber when he created the list'), while A only filters by current status.",
        "Dataset B contains questions about comment metrics (e.g., 'number comments related to the critic'), which A never references.",
        "Dataset B includes percentage calculations (e.g., 'percentage of users gave \"5\"'), while A focuses on absolute counts/aggregates.",
        "Dataset B asks for temporal durations (e.g., 'how long it has not been updated'), whereas A uses fixed temporal filters without duration comparisons.",
        "Dataset B combines multiple output requirements in single answers (e.g., 'average popularity + which movie has highest 5-rating count'), while A requests singular metrics per query.",
        "Dataset B references list descriptions (e.g., 'description for the movie list'), a metadata field never mentioned in A.",
        "Dataset B explicitly tracks critic interactions (e.g., 'critic received the highest amount of likes'), absent in A's user/movie focus.",
        "Dataset B includes explicit comparisons between two specific movies (e.g., 'how much higher is X than Y'), while A only compares aggregated groups."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B includes explicit comparisons between specific, named movies (e.g., 'Innocence Unprotected' vs. 'When Will I Be Loved') rather than general rankings.",
        "Dataset B requires direct references to exact movie IDs (e.g., 'movie id 1269') and user IDs (e.g., 'user 4208563') in queries.",
        "Dataset B asks for URLs tied to specific user-movie interactions (e.g., profile images of users who rated a specific movie) rather than general external URLs.",
        "Dataset B involves percentage calculations (e.g., 'percentage of users gave \"5\"') instead of only basic averages or counts.",
        "Dataset B combines multiple eligibility criteria in single queries (e.g., 'paying subscriber AND eligible for trial when rating').",
        "Dataset B explicitly requests metadata descriptions (e.g., list descriptions, cover images) not referenced in Dataset A.",
        "Dataset B requires temporal duration calculations (e.g., 'how long the list has been created') rather than simple date filters.",
        "Dataset B includes questions that demand dual outputs (e.g., 'average rating score AND release year') in a single response.",
        "Dataset B specifies user-generated comment metrics (e.g., 'number comments related to the critic') as part of popularity criteria.",
        "Dataset B requires explicit status indications in results (e.g., 'Indicate whether the user was a paying subscriber') alongside numerical answers."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Queries in dataset B require retrieving URLs (e.g., user profile images, rating pages), absent in dataset A.",
        "Dataset B includes explicit references to user subscription statuses (e.g., 'paying subscriber,' 'eligible for trial') during actions like ratings or list creation.",
        "Dataset B requests media metadata (e.g., cover images, profile images) not present in dataset A.",
        "Queries in B often demand multi-part answers (e.g., average rating and release year), while A focuses on single attributes.",
        "Dataset B incorporates director-specific criteria (e.g., 'directed by Christopher Nolan'), unlike dataset A.",
        "B includes percentage-based calculations (e.g., '% of users gave \"5\"'), which A does not.",
        "Dataset B references user-generated comments and likes on ratings/critics, absent in A.",
        "B requires calculating temporal durations (e.g., 'how long since last update'), whereas A uses simple date filters.",
        "Queries in B mention platform-specific entities (e.g., 'Mubi'), while A\u2019s questions are platform-agnostic.",
        "Dataset B verifies user statuses at the time of actions (e.g., 'was a subscriber when created'), a feature absent in A."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B questions more frequently require comparative differences (e.g., 'how much higher') rather than absolute values like Dataset A",
        "Dataset B explicitly requests multimedia asset URLs (profile images, cover images, rating URLs) not mentioned in Dataset A",
        "Dataset B includes questions requiring dual outputs (e.g., 'state how long' + title) where Dataset A focuses on single-value responses",
        "Dataset B contains precise date constraints (e.g., '4/19/2020') while Dataset A uses relative timeframes (e.g., 'last year')",
        "Dataset B queries user engagement metrics beyond ratings (e.g., comment counts, likes on ratings) not present in Dataset A",
        "Dataset B requests metadata descriptions (e.g., list descriptions) not referenced in Dataset A questions",
        "Dataset B includes compound user status checks (e.g., 'paying subscriber AND eligible for trial') where Dataset A uses singular attributes",
        "Dataset B requires temporal duration calculations (e.g., 'how long created/not updated') while Dataset A focuses on timestamps",
        "Dataset B features multi-part aggregations (e.g., 'average rating AND release year') in single questions where Dataset A uses singular aggregations",
        "Dataset B asks for existence verification of attributes (e.g., 'was eligible for trial?') combined with quantitative metrics, unlike Dataset A"
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Queries in B require direct comparison of metrics between two specific entities (e.g., 'How much higher is X than Y') while A focuses on standalone metrics",
        "B explicitly requests URL retrieval for specific platform assets (profile images, rating pages) while A only references generic URLs/metadata",
        "B includes percentage-based calculations (e.g., 'percentage of users gave \"5\"') not found in A's aggregation patterns",
        "B contains queries requiring temporal duration calculations (e.g., 'how long the list has been created') beyond A's simple date comparisons",
        "B combines multiple distinct metrics in single requests (e.g., 'average rating AND release year') while A typically requests single metrics",
        "B specifically references external creative roles (directors) and production years, unlike A's general movie attributes",
        "B includes follow-up clarifications in questions (e.g., 'Indicate whether...') requiring conditional responses, absent in A's straightforward requests",
        "B requests descriptive metadata (e.g., list descriptions) beyond A's focus on numerical/status attributes",
        "B uses exact date ranges within years ('April 2020', 'year 2020') while A uses specific date thresholds without temporal grouping",
        "B explicitly references platform-specific branding (Mubi) in questions while A maintains platform-agnostic terminology"
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B queries frequently request URLs (e.g., user profile images, cover images, or rating URLs), while A does not reference URLs.",
        "Dataset B includes questions about the number of 'likes' on ratings (e.g., 'received 20 likes'), which are absent in A.",
        "Dataset B explicitly asks for descriptions of movie lists (e.g., 'description for the movie list'), a feature not present in A.",
        "Dataset B references user-generated comments related to critics (e.g., 'number comments related to the critic'), whereas A lacks comment-related queries.",
        "Dataset B requires calculating durations (e.g., 'how long the list has been created') or time intervals, while A focuses only on date comparisons or thresholds.",
        "Dataset B includes hybrid responses (e.g., 'Indicate whether the user was a paying subscriber' alongside counts), whereas A\u2019s outputs are strictly numerical or categorical.",
        "Dataset B explicitly references exact user IDs (e.g., 'user 4208563') in queries, while A uses generic terms like 'specific user_id'.",
        "Dataset B asks about movie popularity metrics (e.g., 'average popularity of each movie'), a concept absent in A\u2019s rating-focused questions.",
        "Dataset B combines multiple attributes in single responses (e.g., 'average rating score and release year'), while A typically isolates attributes per query.",
        "Dataset B includes eligibility checks tied to specific historical actions (e.g., 'eligible for trial when he created the list'), whereas A filters by current user status."
      ]
    },
    "app_store": {
      "llama3.1-8b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B explicitly requests translated user reviews (e.g., 'state the translated review'), while A does not involve multilingual review data.",
        "Dataset B includes queries about demographic targeting (e.g., 'age group that the app is targeted at'), whereas A uses broad content ratings like 'Everyone' without age granularity.",
        "Dataset B specifically seeks worst-rated apps (e.g., 'worst rating'), while A focuses exclusively on thresholds like 'rating \u22654.0' without inverse extremes.",
        "Dataset B requires exact counts of sentiment categories (e.g., 'number of negative comments'), while A uses aggregated metrics like percentages or ratios for sentiments.",
        "Dataset B references app version details (e.g., 'current version') as part of results, which A never includes.",
        "Dataset B incorporates sentiment subjectivity scores (e.g., 'sentiment subjectivity \u22640.5'), whereas A only uses polarity scores.",
        "Dataset B combines app ratings with explicit user dislike counts (e.g., 'how many users dislike this App'), while A focuses on ratings without direct dislike metrics.",
        "Dataset B retrieves raw user comments (e.g., 'list all negative comments'), whereas A exclusively analyzes sentiment aggregates without exposing individual reviews.",
        "Dataset B uses qualitative sentiment descriptors (e.g., 'users pretty like this app') alongside numerical scores, while A relies strictly on quantitative polarity thresholds.",
        "Dataset B ties sentiment scores to specific genres in ranked results (e.g., 'top 3 genre for sentiment >0.5'), whereas A associates sentiment with categories without genre-level ranking."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_test-time-info_v1": [
        "Dataset B includes explicit requests for translated user reviews (e.g., 'state the translated review'), while A does not reference review content extraction.",
        "Dataset B queries frequently involve percentage ratios of sentiment categories (e.g., 'percentage ratio between positive and negative sentiments'), whereas A uses percentage ratios primarily for app count distributions.",
        "Dataset B contains specific references to app versions (e.g., 'current version') in queries, a feature absent in A.",
        "Dataset B explicitly requests age group targeting information (e.g., 'Indicate the age group'), while A only references content rating categories like 'Teen' without demographic specifics.",
        "Dataset B includes queries about precise install ranges (e.g., '5,000+ installs'), whereas A uses broader thresholds like '100M+ installs'.",
        "Dataset B combines sentiment polarity scores with numerical ratings in single queries (e.g., 'average sentiment polarity score... and its rating'), while A treats these as separate attributes.",
        "Dataset B explicitly requests worst/lowest ratings (e.g., 'worst rating'), while A focuses primarily on above-average rating thresholds.",
        "Dataset B queries frequently include dual sentiment metrics (e.g., 'how many users dislike this App' paired with ratings), whereas A typically pairs sentiment with single attributes like review counts.",
        "Dataset B specifies exact review sentiment thresholds (e.g., 'sentiment subjectivity of no more than 0.5'), while A uses qualitative terms like 'positive sentiment' without numerical bounds.",
        "Dataset B includes genre-specific sentiment analysis (e.g., 'top 3 genre for applications with sentiment review >0.5'), whereas A links genres directly to ratings without explicit sentiment thresholds."
      ],
      "llama3.1-8b_1000_few-shot_bg_v1": [
        "Dataset B includes queries requesting exact counts of apps meeting specific criteria (e.g., 'How many apps have a rating of 5?'), while Dataset A focuses on aggregated sums or averages (e.g., total installs).",
        "Dataset B explicitly references sentiment categories (positive, negative, neutral) in counts or ratios, whereas Dataset A uses sentiment polarity scores for analysis.",
        "Dataset B requires the return of app versions or update metadata (e.g., 'current version') in results, which Dataset A does not specify.",
        "Dataset B includes queries targeting individual apps by name (e.g., 'Dragon Ball Legends'), while Dataset A queries are category/genre-focused.",
        "Dataset B combines price metrics with other attributes (e.g., 'most expensive app's sentiment'), whereas Dataset A does not merge price with sentiment analysis.",
        "Dataset B asks for minimum or lowest values (e.g., 'worst rating'), whereas Dataset A exclusively queries maximums or top-ranked results.",
        "Dataset B includes demographic targeting (e.g., 'Teen' age groups) in filters, absent in Dataset A.",
        "Dataset B explicitly requests translated reviews or user comments as part of outputs (e.g., 'state the translated review'), while Dataset A may mention reviews but does not require their inclusion.",
        "Dataset B calculates percentage ratios between sentiment categories (e.g., 'positive vs. negative'), whereas Dataset A uses averages or overall percentages.",
        "Dataset B queries review subjectivity metrics (e.g., 'sentiment subjectivity \u2264 0.5'), a dimension not present in Dataset A."
      ],
      "qwen2.5-coder-7b_1000_few-shot_bg_v1": [
        "Dataset B queries explicitly request translated user reviews (e.g., 'state the translated review'), while A never mentions review content extraction",
        "Dataset B contains questions about app version information (e.g., 'indicate the current version'), a dimension absent in Dataset A",
        "Dataset B includes demographic targeting criteria (e.g., 'age group', 'Teen content rating') not present in Dataset A queries",
        "Dataset B specifies exact rating matches (e.g., 'rating is 3.9', 'have 5 rating') while A typically uses comparison operators (> 4.5)",
        "Dataset B queries combine sentiment metrics with financial aspects (e.g., 'most expensive app', 'average price') unlike A",
        "Dataset B requires identification of worst/best extremes (e.g., 'worst rating', 'lowest sentiment polarity') while A focuses on positive thresholds",
        "Dataset B includes percentage ratio calculations between sentiment categories (e.g., 'percentage ratio between positive and negative') where A uses absolute counts",
        "Dataset B references specific review subjectivity thresholds (e.g., 'sentiment subjectivity of no more than 0.5') not found in A",
        "Dataset B queries explicitly filter by installation ranges (e.g., 'only 5,000+ installs') while A uses 'more than' thresholds",
        "Dataset B contains questions about app content suitability (e.g., 'suitable for teenagers') as a filter criterion, absent in A"
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes queries about sentiment subjectivity scores, which are absent in Dataset A (e.g., 'sentiment subjectivity of no more than 0.5').",
        "Dataset B explicitly asks for percentage-based calculations (e.g., 'percentage ratio of positive sentiment reviews'), while Dataset A focuses on absolute counts or averages.",
        "Dataset B requires listing ranked results (e.g., 'top 3 genre', 'top 5 shopping apps'), whereas Dataset A primarily asks for singular extremes (e.g., 'highest rating').",
        "Dataset B combines sentiment polarity scores with other metrics in the same query (e.g., 'average rating and percentage ratio of positive sentiment'), while Dataset A treats sentiment as a standalone metric.",
        "Dataset B queries often include conditional sentiment analysis (e.g., 'reviews with sentiment review greater than 0.5'), unlike Dataset A's binary positive/negative classification.",
        "Dataset B explicitly requests translated reviews as part of output results (e.g., 'state the translated review'), while Dataset A only references translated reviews as a filter criterion.",
        "Dataset B includes queries about app versions (e.g., 'current version'), which are never mentioned in Dataset A.",
        "Dataset B uses comparative phrases like 'worst rating' paired with sentiment analysis, while Dataset A uses simpler extremes like 'lowest-rated apps' without sentiment context.",
        "Dataset B contains queries about sentiment polarity scores for specific user segments (e.g., 'people who dislike the app pretty much'), adding granular sentiment filtering absent in Dataset A.",
        "Dataset B frequently combines demographic targeting with sentiment metrics (e.g., 'targeted to teens and their average sentiment polarity'), while Dataset A treats demographics as standalone filters."
      ],
      "llama3.1-8b_1000_zero-shot_bg_test-time-info_v1": [
        "Dataset B includes queries about sentiment polarity scores (e.g., 'sentiment polarity score', 'positive/negative sentiment ratio'), which are absent in Dataset A.",
        "Dataset B explicitly requests translated user reviews (e.g., 'state the translated review'), while Dataset A never references review text or translations.",
        "Dataset B contains queries about sentiment subjectivity scores (e.g., 'sentiment subjectivity of no more than 0.5'), a metric not present in Dataset A.",
        "Dataset B requires percentage calculations for sentiment distributions (e.g., 'percentage ratio of positive sentiment reviews'), unlike Dataset A which focuses on absolute values like average ratings.",
        "Dataset B includes version-specific app data (e.g., 'current version'), while Dataset A queries never reference app versions.",
        "Dataset B combines sentiment metrics with demographic targeting (e.g., 'indicate the age group targeted'), whereas Dataset A only uses broad content ratings like 'Teen'.",
        "Dataset B explicitly asks for worst-rated extremes (e.g., 'app with the worst rating'), while Dataset A focuses on averages, thresholds, or top-rated apps.",
        "Dataset B queries often request multiple distinct metrics in a single question (e.g., 'rating + sentiment polarity + genre'), whereas Dataset A questions typically focus on one primary metric with filters.",
        "Dataset B references review comment classifications (e.g., 'negative comments', 'neutral comments'), while Dataset A only deals with numerical ratings and install counts.",
        "Dataset B includes queries about sales performance (e.g., 'best selling app'), a dimension absent in Dataset A which focuses purely on ratings and install metrics."
      ],
      "llama3.1-8b_1000_zero-shot_bg_v1": [
        "Dataset B includes queries requesting exact counts of apps meeting specific rating thresholds (e.g., 'How many apps have rating of 5?'), while A focuses on averages and thresholds without explicit count requests.",
        "Dataset B incorporates percentage-based calculations (e.g., 'What percentage of no comment reviews...'), whereas A primarily uses absolute metrics like averages or install counts.",
        "Dataset B explicitly queries about demographic targeting (e.g., 'age group,' 'Teen content rating'), while A lacks references to user demographics or content ratings.",
        "Dataset B requires retrieval of translated user reviews for specific conditions (e.g., 'state the translated review'), whereas A only references sentiment polarity without direct review extraction.",
        "Dataset B includes queries about app versions (e.g., 'current version'), which are absent in A's samples.",
        "Dataset B combines sentiment analysis with non-sentiment metrics in single queries (e.g., 'average rating... and percentage ratio of positive sentiment reviews'), while A separates these into distinct queries.",
        "Dataset B explicitly asks for minimums/worst-case metrics (e.g., 'worst rating'), whereas A focuses on averages, top performers, or thresholds above a value.",
        "Dataset B references sentiment subjectivity scores (e.g., 'sentiment subjectivity of no more than 0.5'), while A only uses polarity scores or sentiment categories.",
        "Dataset B includes queries about free apps as a distinct category (e.g., 'List all free sports Apps'), whereas A treats price as a numerical filter without explicit 'free' classification.",
        "Dataset B requires direct enumeration of negative user comments (e.g., 'List all the negative comments'), while A quantifies sentiment through scores/categories without raw text retrieval."
      ],
      "qwen2.5-coder-7b_1000_zero-shot_bg_v1": [
        "Dataset B queries explicitly request translated user reviews (e.g., 'state the translated review'), while A does not reference review text content.",
        "Dataset B includes demographic targeting parameters (e.g., 'Teen' content rating, 'age group') absent in Dataset A.",
        "Dataset B requires percentage calculations (e.g., 'percentage ratio of positive sentiment reviews') while A focuses on absolute counts or averages.",
        "Dataset B incorporates monetary metrics (e.g., 'most expensive app', 'average price') not present in Dataset A queries.",
        "Dataset B combines sentiment analysis with cost/price metrics in single queries (e.g., 'total Sentiment polarity score of the most expensive app').",
        "Dataset B specifies version information requirements (e.g., 'current version') for apps, unlike Dataset A.",
        "Dataset B frequently requests multiple distinct metrics per query (e.g., 'rating + dislike count + translated review') while A typically asks for single metrics.",
        "Dataset B references app pricing models (e.g., 'free apps') and install types ('free apps') explicitly, unlike Dataset A.",
        "Dataset B analyzes sentiment subjectivity scores (e.g., 'sentiment subjectivity of no more than 0.5') in addition to polarity used in both datasets.",
        "Dataset B requires identification of specific user attitudes (e.g., 'users pretty like this app') beyond simple positive/negative classifications used in A."
      ]
    }
  }
}