[
  {
    "question": "How many zip codes in the dataset have multiple counties associated with them?",
    "code": "# Count the number of zip codes in zip_data where multi_county is 'Yes'\nmulti_county_count = zip_data[zip_data['multi_county'] == 'Yes']['zip_code'].nunique()\nprint(multi_county_count)",
    "database": "address",
    "choices": [
      "52",
      "29",
      "14",
      "12"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    },
    "annotation": "count"
  },
  {
    "question": "Which zip code has both the highest population and belongs to the CBSA with the largest population?",
    "code": "# Find the CBSA with the largest population by summing populations in zip_data.\n# Then, find the zip code with the highest population within that CBSA.\ncbsa_populations = zip_data.groupby('CBSA')['population_2020'].sum()\nlargest_cbsa = cbsa_populations.idxmax()\nzip_codes_in_largest_cbsa = zip_data[zip_data['CBSA'] == largest_cbsa]\nzip_code_with_highest_population = zip_codes_in_largest_cbsa.loc[zip_codes_in_largest_cbsa['population_2020'].idxmax()]['zip_code']\nprint(zip_code_with_highest_population)",
    "database": "address",
    "choices": [
      "17603",
      "11367",
      "5401",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    },
    "annotation": "0 ambugous in question's \"population\" and row \"population_2020\""
  },
  {
    "question": "What is the total land area covered by the districts represented by Democrats?",
    "code": "total_land_area = congress[congress['party'] == 'Democrat']['land_area'].sum()\nprint(total_land_area)",
    "database": "address",
    "choices": [
      "200652.38999999998",
      "142079.06",
      "58313.18",
      "381236.23"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    },
    "annotation": "sum"
  },
  {
    "question": "What was the average arrival delay for flights operated by Envoy Air?",
    "code": "envoy_airline_code = Air_Carriers[Air_Carriers['Description'].str.contains(\"Envoy Air\")]['Code'].values[0]\nenvoy_flight_delays = Airlines[Airlines['OP_CARRIER_AIRLINE_ID'] == envoy_airline_code]['ARR_DELAY'].dropna()\naverage_delay = envoy_flight_delays.mean()\nprint(average_delay)",
    "database": "airline",
    "choices": [
      "-0.65",
      "-2.7142857142857144",
      "-5.75",
      "1.441860465116279"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    },
    "annotation": "average"
  },
  {
    "question": "What is the cancellation rate of flights for United Air Lines Inc.?",
    "code": "united_airlines_code = Air_Carriers[Air_Carriers['Description'].str.contains(\"United Air Lines Inc.\")]['Code'].values[0]\nunited_flights = Airlines[Airlines['OP_CARRIER_AIRLINE_ID'] == united_airlines_code]\ncancellation_rate = united_flights['CANCELLED'].mean() * 100\nprint(cancellation_rate)",
    "database": "airline",
    "choices": [
      "4.166666666666666",
      "2.083333333333333",
      "2.197802197802198",
      "7.142857142857142"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    },
    "annotation": "average"
  },
  {
    "question": "What was the destination airport with the highest average arrival delay for United Air Lines Inc.?",
    "code": "united_airline_id = Air_Carriers.loc[Air_Carriers['Description'].str.contains('United Air Lines Inc.:'), 'Code'].values[0]\nunited_flights = Airlines[Airlines['OP_CARRIER_AIRLINE_ID'] == united_airline_id]\naverage_arrival_delay_by_airport = united_flights.groupby('DEST')['ARR_DELAY'].mean()\ntop_destination_code = average_arrival_delay_by_airport.idxmax()\ntop_destination_name = Airports.loc[Airports['Code'] == top_destination_code, 'Description'].values[0]\nprint(top_destination_name)",
    "database": "airline",
    "choices": [
      "Fort Lauderdale, FL: Fort Lauderdale-Hollywood International",
      "El Paso, TX: El Paso International",
      "Nashville, TN: Nashville International",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    },
    "annotation": "item"
  },
  {
    "question": "Which flight number had the largest difference in scheduled versus actual elapsed time?",
    "code": "Airlines['ELAPSED_TIME_DIFF'] = (Airlines['CRS_ELAPSED_TIME'] - Airlines['ACTUAL_ELAPSED_TIME']).abs()\nmax_difference_flight_num = Airlines.loc[Airlines['ELAPSED_TIME_DIFF'].idxmax(), 'OP_CARRIER_FL_NUM']\nprint(max_difference_flight_num)",
    "database": "airline",
    "choices": [
      "124",
      "147",
      "2278",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 1
    },
    "annotation": "item"
  },
  {
    "question": "Which airport had the highest number of flights arriving from Southwest Airlines Co. (code 19393)?",
    "code": "sw_airlines_flights = Airlines[Airlines['OP_CARRIER_AIRLINE_ID'] == 19393]\ndest_flight_counts = sw_airlines_flights['DEST'].value_counts()\nhighest_count_airport = dest_flight_counts.idxmax()\nairport_description = Airports.loc[Airports['Code'] == highest_count_airport, 'Description'].values[0]\nprint(airport_description)",
    "database": "airline",
    "choices": [
      "Las Vegas, NV: McCarran International",
      "Dallas, TX: Dallas Love Field",
      "Chicago, IL: Chicago Midway International",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 1
    },
    "annotation": "item"
  },
  {
    "question": "What is the average arrival delay for Delta Air Lines Inc. flights?",
    "code": "# Join Airlines and Air_Carriers tables to filter by Delta Air Lines Inc.\ndelta_id = Air_Carriers.loc[Air_Carriers['Description'].str.contains('Delta Air Lines Inc.'), 'Code'].values[0]\n\n# Filter for Delta Air Lines Inc. flights and calculate the average arrival delay.\ndelta_flights = Airlines[Airlines['OP_CARRIER_AIRLINE_ID'] == delta_id]\naverage_arrival_delay_delta = delta_flights['ARR_DELAY'].mean()\n\nprint(average_arrival_delay_delta)",
    "database": "airline",
    "choices": [
      "3.7950819672131146",
      "-1.9090909090909092",
      "4.214285714285714",
      "3.5517241379310347"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    },
    "annotation": "average"
  },
  {
    "question": "Which destination airport has the longest average arrival delay for flights operated by Envoy Air, and what is its description?",
    "code": "# Get the airline ID for Envoy Air.\nenvoy_air_id = Air_Carriers.loc[Air_Carriers['Description'].str.contains('Envoy Air'), 'Code'].values[0]\n\n# Filter flights operated by Envoy Air.\nenvoy_air_flights = Airlines[Airlines['OP_CARRIER_AIRLINE_ID'] == envoy_air_id]\n\n# Group by destination airport and calculate the average arrival delay.\naverage_arrival_delay_by_dest = envoy_air_flights.groupby('DEST')['ARR_DELAY'].mean()\n\n# Get the airport code with the longest average delay.\nlongest_delay_dest_code = average_arrival_delay_by_dest.idxmax()\n\n# Get the description of the airport using the Airports table.\nlongest_delay_dest_description = Airports.loc[Airports['Code'] == longest_delay_dest_code, 'Description'].values[0]\n\nprint(longest_delay_dest_description)",
    "database": "airline",
    "choices": [
      "Marquette, MI: Sawyer International",
      "Rochester, NY: Greater Rochester International",
      "Grand Rapids, MI: Gerald R. Ford International",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 0,
      "16k": 1
    },
    "annotation": "0 2 questions hybird"
  },
  {
    "question": "How many unique apps are there in the \"playstore\" table that also have user reviews in the \"user_reviews\" table?",
    "code": "unique_apps_with_reviews = set(playstore['App']).intersection(set(user_reviews['App']))\nresult = len(unique_apps_with_reviews)\nprint(result)",
    "database": "app_store",
    "choices": [
      "111",
      "588",
      "366",
      "204"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 3,
      "16k": 0
    },
    "annotation": "count"
  },
  {
    "question": "What is the average rating of apps in the \"playstore\" table that have positive reviews?",
    "code": "apps_with_positive_reviews = user_reviews[user_reviews['Sentiment'] == 'Positive']['App'].unique()\nfiltered_playstore = playstore[playstore['App'].isin(apps_with_positive_reviews)]\nresult = filtered_playstore['Rating'].mean()\nprint(result)",
    "database": "app_store",
    "choices": [
      "4.317123287671232",
      "4.328820960698691",
      "4.281944444444445",
      "4.32453531598513"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    },
    "annotation": "average"
  },
  {
    "question": "How many unique apps have reviews with a sentiment labeled as 'Positive'?",
    "code": "# Filter user_reviews for entries with positive sentiment\npositive_reviews = user_reviews[user_reviews['Sentiment'] == 'Positive']\n\n# Merge with playstore to ensure valid apps from the playstore\npositive_apps = positive_reviews.merge(playstore, on='App')\n\n# Count the unique apps in these positive reviews\nunique_positive_apps = positive_apps['App'].nunique()\n\nprint(unique_positive_apps)",
    "database": "app_store",
    "choices": [
      "254",
      "146",
      "43",
      "78"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    },
    "annotation": "count"
  },
  {
    "question": "What is the total number of reviews for free apps that have received at least one neutral review?",
    "code": "# Get apps with at least one neutral review\nneutral_review_apps = user_reviews[user_reviews['Sentiment'] == 'Neutral']['App']\n\n# Get free apps from playstore\nfree_apps = playstore[playstore['Type'] == 'Free']['App']\n\n# Find apps that are free and have at least one neutral review\nfree_neutral_apps = set(neutral_review_apps).intersection(free_apps)\n\n# Get total reviews for these apps from playstore\ntotal_reviews = playstore[playstore['App'].isin(free_neutral_apps)]['Reviews'].sum()\n\nprint(total_reviews)",
    "database": "app_store",
    "choices": [
      "3477657",
      "19897499",
      "198536622",
      "4911258"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 3,
      "16k": 0
    },
    "annotation": "sum"
  },
  {
    "question": "What is the average rating for apps that have at least one negative sentiment review?",
    "code": "# Merge playstore and user_reviews on the 'App' field.\nmerged_data = playstore.merge(user_reviews, on='App')\n\n# Filter for apps with negative sentiment reviews\nnegative_reviews = merged_data[merged_data['Sentiment'] == 'Negative']\n\n# Calculate the average rating for apps with negative sentiment reviews - drop duplicates to consider each app only once\naverage_rating = negative_reviews[['App', 'Rating']].drop_duplicates()['Rating'].mean()\n\nprint(average_rating)",
    "database": "app_store",
    "choices": [
      "4.247058823529412",
      "4.280645161290322",
      "4.236065573770492",
      "4.220183486238532"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    },
    "annotation": "average"
  },
  {
    "question": "What is the total number of reviews for apps in the 'BOOKS_AND_REFERENCE' category that have received positive sentiment reviews?",
    "code": "# Merge playstore and user_reviews on the 'App' field\nmerged_data = playstore.merge(user_reviews, on='App')\n\n# Filter for apps in the 'BOOKS_AND_REFERENCE' category with positive sentiment reviews\npositive_reviews_books = merged_data[\n    (merged_data['Category'] == 'BOOKS_AND_REFERENCE') & \n    (merged_data['Sentiment'] == 'Positive')\n]\n\n# Sum the number of reviews for these apps, removing duplicates to sum by unique apps\ntotal_reviews_books_positive = positive_reviews_books[['App', 'Reviews']].drop_duplicates()['Reviews'].sum()\n\nprint(total_reviews_books_positive)",
    "database": "app_store",
    "choices": [
      "4509115",
      "2068420",
      "246315",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    },
    "annotation": "sum"
  },
  {
    "question": "What is the genre with the highest average rating among apps that have negative sentiment reviews?",
    "code": "# Merge playstore and user_reviews on the 'App' field\nmerged_data = playstore.merge(user_reviews, on='App')\n\n# Filter for apps with negative sentiment reviews\nnegative_reviews = merged_data[merged_data['Sentiment'] == 'Negative']\n\n# Calculate the average rating per Genre for these apps\ngenre_average_rating = negative_reviews[['Genres', 'Rating']].drop_duplicates().groupby('Genres')['Rating'].mean()\n\n# Find the genre with the highest average rating\nhighest_average_genre = genre_average_rating.idxmax()\n\nprint(highest_average_genre)",
    "database": "app_store",
    "choices": [
      "Food & Drink",
      "Medical",
      "Board",
      "Action"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    },
    "annotation": "item"
  },
  {
    "question": "How many unique affiliations have authors with papers listed in this dataset?",
    "code": "# Use the PaperAuthor table to get all unique affiliations, excluding empty ones\nunique_affiliations = PaperAuthor[PaperAuthor['Affiliation'].notnull()]['Affiliation'].nunique()\n\nprint(unique_affiliations)",
    "database": "authors",
    "choices": [
      "452",
      "90",
      "166",
      "45"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    },
    "annotation": "count"
  },
  {
    "question": "How many unique authors have published papers in journals?",
    "code": "merged_df = PaperAuthor.merge(Paper, how='inner', left_on='PaperId', right_on='Id')\npublished_in_journals = merged_df[merged_df['JournalId'] != 0]\nunique_authors = published_in_journals['AuthorId'].nunique()\nprint(unique_authors)",
    "database": "authors",
    "choices": [
      "326",
      "950",
      "78",
      "165"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    },
    "annotation": "count"
  },
  {
    "question": "What is the name of the conference with the highest number of papers in the dataset?",
    "code": "conference_papers = Paper[Paper['ConferenceId'] != 0]\nmost_papers_conference_id = conference_papers['ConferenceId'].value_counts().idxmax()\nmost_papers_conference_name = Conference[Conference['Id'] == most_papers_conference_id]['FullName'].iloc[0]\nprint(most_papers_conference_name)",
    "database": "authors",
    "choices": [
      "IEEE International Conference on Micro Electro Mechanical Systems",
      "Storage and Retrieval for Image and Video Databases",
      "International Conference on Robotics and Automation",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 0,
      "16k": 1
    },
    "annotation": "item"
  },
  {
    "question": "How many papers in the dataset are co-authored by authors with different affiliations?",
    "code": "co_authored_papers = PaperAuthor.groupby('PaperId').filter(lambda x: x['Affiliation'].nunique() > 1)['PaperId'].nunique()\nprint(co_authored_papers)",
    "database": "authors",
    "choices": [
      "0",
      "1",
      "4",
      "24"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    },
    "annotation": "count"
  },
  {
    "question": "How many active customers are there based on their address status?",
    "code": "active_status_id = address_status[address_status['address_status'] == 'Active']['status_id'].values[0]\nactive_customer_ids = customer_address[customer_address['status_id'] == active_status_id]['customer_id']\nactive_customers_count = customer[customer['customer_id'].isin(active_customer_ids)].shape[0]\nprint(active_customers_count)",
    "database": "books",
    "choices": [
      "407",
      "214",
      "106",
      "52"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    },
    "annotation": "count"
  },
  {
    "question": "How many customers have placed orders with 'Pending Delivery' status?",
    "code": "pending_status_id = order_status[order_status['status_value'] == 'Pending Delivery']['status_id'].values[0]\npending_orders = order_history[order_history['status_id'] == pending_status_id]['order_id']\npending_customers = cust_order[cust_order['order_id'].isin(pending_orders)]['customer_id'].nunique()\nprint(pending_customers)",
    "database": "books",
    "choices": [
      "72",
      "17",
      "35",
      "157"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    },
    "annotation": "count"
  },
  {
    "question": "What is the total number of books published by \"University of Chicago Press\"?",
    "code": "ucp_books = book[book['publisher_id'] == publisher[publisher['publisher_name'] == 'University of Chicago Press']['publisher_id'].iloc[0]]\ntotal_ucp_books = len(ucp_books)\nprint(total_ucp_books)",
    "database": "books",
    "choices": [
      "4",
      "5",
      "1",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    },
    "annotation": "count"
  },
  {
    "question": "How many customers have an \"Active\" address status?",
    "code": "active_status_id = address_status[address_status['address_status'] == 'Active']['status_id'].iloc[0]\nactive_customers = customer_address[customer_address['status_id'] == active_status_id]\ntotal_active_customers = len(active_customers)\nprint(total_active_customers)",
    "database": "books",
    "choices": [
      "438",
      "107",
      "221",
      "52"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    },
    "annotation": "count"
  },
  {
    "question": "Which author has the most books in this dataset?",
    "code": "joined_books_authors = book.merge(book_author, on='book_id')\nbook_count_per_author = joined_books_authors['author_id'].value_counts()\nauthor_with_most_books_id = book_count_per_author.idxmax()\nauthor_with_most_books_name = author[author['author_id'] == author_with_most_books_id]['author_name'].iloc[0]\nprint(author_with_most_books_name)",
    "database": "books",
    "choices": [
      "Dick Francis",
      "Agatha Christie",
      "Bob Dylan",
      "Piers Anthony"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    },
    "annotation": "item"
  },
  {
    "question": "Which author has the most books written in English?",
    "code": "english_language_id = book_language[book_language['language_name'] == 'English']['language_id'].values[0]\nenglish_books = book[book['language_id'] == english_language_id]['book_id']\nauthors_books = book_author[book_author['book_id'].isin(english_books)]\nmost_authored = authors_books['author_id'].mode().iloc[0]\nauthor_name = author[author['author_id'] == most_authored]['author_name'].values[0]\nprint(author_name)",
    "database": "books",
    "choices": [
      "Agatha Christie",
      "Bob Dylan",
      "Alan Greenspan",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 0,
      "32k": 1,
      "16k": 2
    },
    "annotation": "item"
  },
  {
    "question": "What is the average number of pages for books published by Del Rey Books?",
    "code": "del_rey_id = publisher[publisher['publisher_name'] == 'Del Rey Books']['publisher_id'].values[0]\ndel_rey_books = book[book['publisher_id'] == del_rey_id]\naverage_pages = del_rey_books['num_pages'].mean()\nprint(average_pages)",
    "database": "books",
    "choices": [
      "384.0",
      "397.5",
      "394.5",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    },
    "annotation": "average"
  },
  {
    "question": "How many customers have an active address in Canada?",
    "code": "active_status_id = address_status[address_status['address_status'] == 'Active']['status_id'].iloc[0]\ncanadian_country_id = country[country['country_name'] == 'Canada']['country_id'].iloc[0]\ncanadian_addresses = address[address['country_id'] == canadian_country_id]['address_id']\nactive_customers_in_canada = customer_address[(customer_address['address_id'].isin(canadian_addresses)) & \n                                              (customer_address['status_id'] == active_status_id)]['customer_id'].nunique()\nprint(active_customers_in_canada)",
    "database": "books",
    "choices": [
      "5",
      "3",
      "1",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    },
    "annotation": "count"
  },
  {
    "question": "Which publisher has published the most books?",
    "code": "book_count_per_publisher = book['publisher_id'].value_counts()\nmost_published_publisher_id = book_count_per_publisher.idxmax()\nmost_published_publisher_name = publisher[publisher['publisher_id'] == most_published_publisher_id]['publisher_name'].iloc[0]\nprint(most_published_publisher_name)",
    "database": "books",
    "choices": [
      "Vintage",
      "Penguin Books",
      "VIZ Media",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 1,
      "32k": 2,
      "16k": 0
    },
    "annotation": "item"
  },
  {
    "question": "How many orders are currently in \"Pending Delivery\" status?",
    "code": "pending_delivery_status_id = order_status[order_status['status_value'] == 'Pending Delivery']['status_id'].iloc[0]\npending_orders = order_history[order_history['status_id'] == pending_delivery_status_id]['order_id'].nunique()\nprint(pending_orders)",
    "database": "books",
    "choices": [
      "75",
      "36",
      "17",
      "163"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    },
    "annotation": "count"
  },
  {
    "question": "How many schools have both FRPM data and SAT scores data?",
    "code": "# Merge FRPM and SAT scores data on school codes to determine common schools\ncommon_schools = frpm[['CDSCode']].merge(satscores, left_on='CDSCode', right_on='cds', how='inner')\n\n# Count the number of unique schools with data in both tables\nnum_common_schools = common_schools['CDSCode'].nunique()\n\nprint(num_common_schools)",
    "database": "california_schools",
    "choices": [
      "0",
      "1",
      "2",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 1,
      "16k": 0
    },
    "annotation": "count"
  },
  {
    "question": "What is the average enrollment for schools that are listed in both the FRPM and school data?",
    "code": "# Merge the FRPM and schools table to filter schools present in both tables\nmerged_enrollment_data = frpm[['CDSCode', 'Enrollment (K-12)']].merge(schools, left_on='CDSCode', right_on='CDSCode', how='inner')\n\n# Calculate average enrollment of these schools\naverage_enrollment = merged_enrollment_data['Enrollment (K-12)'].mean()\n\nprint(average_enrollment)",
    "database": "california_schools",
    "choices": [
      "630.421875",
      "631.5703125",
      "815.53125",
      "671.15625"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    },
    "annotation": "average"
  },
  {
    "question": "What is the total enrollment for schools with more than 50% of students eligible for free or reduced-price meals located in Los Angeles County?",
    "code": "# Merge the `schools` and `frpm` tables on `CDSCode`.\nmerged_df = schools.merge(frpm, on='CDSCode')\n\n# Filter schools in Los Angeles County and with Percent (%) Eligible FRPM > 0.5\nfiltered_df = merged_df[(merged_df['County'] == \"Los Angeles\") & \n                        (merged_df['Percent (%) Eligible FRPM (K-12)'] > 0.5)]\n\n# Calculate the total enrollment for the filtered schools.\ntotal_enrollment = filtered_df['Enrollment (K-12)'].sum()\n\n# Output the result\nprint(total_enrollment)",
    "database": "california_schools",
    "choices": [
      "16047.0",
      "10256.0",
      "35582.0",
      "22499.0"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 0,
      "16k": 1
    },
    "annotation": "sum"
  },
  {
    "question": "How many schools are directly funded charter schools?",
    "code": "# Filter directly funded charter schools from the 'schools' DataFrame.\ndirectly_funded_charter_schools = schools[(schools['Charter'] == 1) & (schools['FundingType'] == 'Directly funded')]\n\n# Count the number of directly funded charter schools.\nnum_directly_funded_charter_schools = directly_funded_charter_schools.shape[0]\n\nprint(num_directly_funded_charter_schools)",
    "database": "california_schools",
    "choices": [
      "6",
      "15",
      "4",
      "11"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    },
    "annotation": "count"
  },
  {
    "question": "What is the total enrollment for all schools in Los Angeles County that have SAT scores?",
    "code": "merged_data = schools.merge(satscores, left_on='CDSCode', right_on='cds')\nla_schools = merged_data[merged_data['County'] == \"Los Angeles\"]\ntotal_enrollment = la_schools['enroll12'].sum()\nprint(total_enrollment)",
    "database": "california_schools",
    "choices": [
      "685",
      "1295",
      "0",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 0,
      "16k": 2
    },
    "annotation": "sum"
  },
  {
    "question": "What is the most common release date among the sets that have cards with the keyword \"Trample\"?",
    "code": "# Merge cards and sets on setCode\ncards_sets_merged = cards.merge(sets, left_on='setCode', right_on='code')\n\n# Filter cards that have the keyword \"Trample\"\ntrample_cards = cards[cards['keywords'].str.contains('Trample', na=False)]\n\n# Merge trample cards with sets to get release dates\ntrample_sets = trample_cards.merge(sets, left_on='setCode', right_on='code')\n\n# Find the most common release date among these sets\nmost_common_release_date = trample_sets['releaseDate'].mode()[0]\nprint(most_common_release_date)",
    "database": "card_games",
    "choices": [
      "2015-07-17",
      "2004-02-06",
      "2019-05-03",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    },
    "annotation": "item"
  },
  {
    "question": "How many unique languages are there in which cards with the rarity \"rare\" have been translated?",
    "code": "# Filter cards with rarity \"rare\"\nrare_cards = cards[cards['rarity'] == 'rare']\n\n# Merge rare cards with foreign_data on uuid to find translations\nrare_foreign_data = rare_cards.merge(foreign_data, on='uuid')\n\n# Count unique languages\nunique_languages_count = rare_foreign_data['language'].nunique()\nprint(unique_languages_count)",
    "database": "card_games",
    "choices": [
      "4",
      "3",
      "9",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 0,
      "16k": 1
    },
    "annotation": "count"
  },
  {
    "question": "What is the average converted mana cost of cards that have been printed in sets released in the year 2019?",
    "code": "# Filter sets released in 2019\nsets_2019 = sets[sets['releaseDate'].str.startswith('2019')]\n\n# Join cards with sets to find cards from 2019\ncards_2019 = cards.merge(sets_2019, left_on='setCode', right_on='code')\n\n# Calculate the average converted mana cost\naverage_converted_mana_cost = cards_2019['convertedManaCost'].mean()\nprint(average_converted_mana_cost)",
    "database": "card_games",
    "choices": [
      "4.0",
      "3.1666666666666665",
      "5.0",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    },
    "annotation": "average"
  },
  {
    "question": "How many cards share the same uuid across different formats specified in the legalities table?",
    "code": "unique_cards_in_legalities = legalities['uuid'].nunique()\nprint(unique_cards_in_legalities)",
    "database": "card_games",
    "choices": [
      "15",
      "24",
      "64",
      "128"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    },
    "annotation": "count"
  },
  {
    "question": "What is the most common language for the foreign cards available in the dataset?",
    "code": "common_language = foreign_data['language'].mode().iloc[0]\nprint(common_language)",
    "database": "card_games",
    "choices": [
      "Chinese Traditional",
      "Italian",
      "French",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 1,
      "16k": 0
    },
    "annotation": "item"
  },
  {
    "question": "Which is the earliest released set that contains cards with foreign translations?",
    "code": "sets_with_translations = set_translations['setCode'].unique()\nearly_set = sets[sets['code'].isin(sets_with_translations)]['releaseDate'].min()\nearly_set_name = sets.loc[sets['releaseDate'] == early_set, 'name'].values[0]\nprint(early_set_name)",
    "database": "card_games",
    "choices": [
      "Seventh Edition",
      "Fourth Edition",
      "Homelands",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    },
    "annotation": "item"
  },
  {
    "question": "How many cards have foreign translations available in French according to the \"foreign_data\" table?",
    "code": "french_cards_uuids = foreign_data[foreign_data['language'] == \"French\"]['uuid'].unique()\nfrench_translations_count = cards[cards['uuid'].isin(french_cards_uuids)]['name'].nunique()\nprint(french_translations_count)",
    "database": "card_games",
    "choices": [
      "23",
      "10",
      "4",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    },
    "annotation": "count"
  },
  {
    "question": "How many crimes were reported in the district with the district number 3?",
    "code": "crimes_in_district_3 = Crime[Crime['district_no'] == 3].shape[0]\nprint(crimes_in_district_3)",
    "database": "chicago_crime",
    "choices": [
      "36",
      "4",
      "15",
      "59"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    },
    "annotation": "count"
  },
  {
    "question": "Which neighborhood has the most crimes reported in it?",
    "code": "most_crime_area_no = Crime['community_area_no'].mode()[0]\nmost_crime_neighborhood = Neighborhood[Neighborhood['community_area_no'] == most_crime_area_no]['neighborhood_name'].values[0]\nprint(most_crime_neighborhood)",
    "database": "chicago_crime",
    "choices": [
      "Humboldt Park",
      "Galewood",
      "Gold Coast",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 1,
      "32k": 2,
      "16k": 0
    },
    "annotation": "item"
  },
  {
    "question": "What is the email of the alderman for the ward where the most drug abuse crimes were reported?",
    "code": "drug_abuse_fbi_code_no = FBI_Code[FBI_Code['title'] == 'Drug Abuse']['fbi_code_no'].values[0]\nward_with_most_drug_abuse = Crime[Crime['fbi_code_no'] == drug_abuse_fbi_code_no]['ward_no'].mode()[0]\nalderman_email = Ward[Ward['ward_no'] == ward_with_most_drug_abuse]['ward_email'].values[0]\nprint(alderman_email)",
    "database": "chicago_crime",
    "choices": [
      "ward02@cityofchicago.org",
      "ward27@cityofchicago.org",
      "ward24@cityofchicago.org",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 1,
      "16k": 0
    },
    "annotation": "item"
  },
  {
    "question": "Which police district commander manages the largest population within their district?",
    "code": "district_populations = Ward.groupby('ward_no')['Population'].sum().reset_index()\nlargest_population_ward = district_populations.loc[district_populations['Population'].idxmax(), 'ward_no']\ndistrict_no_for_ward = Crime[Crime['ward_no'] == largest_population_ward].iloc[0]['district_no']\ncommander_name = District[District['district_no'] == district_no_for_ward]['commander'].values[0]\nprint(commander_name)",
    "database": "chicago_crime",
    "choices": [
      "Brendan McCrudden",
      "Michelle A. Rubino",
      "Stephen C. Chung",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    },
    "annotation": "item"
  },
  {
    "question": "What is the name of the neighborhood with the most motor vehicle thefts?",
    "code": "motor_vehicle_iucr_no = IUCR.loc[IUCR['primary_description'] == 'MOTOR VEHICLE THEFT', 'iucr_no'].values[0]\nmotor_vehicle_crimes = Crime[Crime['iucr_no'] == motor_vehicle_iucr_no]\ncommunity_area_with_most_thefts = motor_vehicle_crimes['community_area_no'].value_counts().idxmax()\nneighborhood_name_with_most_thefts = Neighborhood.loc[Neighborhood['community_area_no'] == community_area_with_most_thefts, 'neighborhood_name'].values[0]\nprint(neighborhood_name_with_most_thefts)",
    "database": "chicago_crime",
    "choices": [
      "Loop",
      "Hermosa",
      "Jackson Park Highlands",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 0,
      "16k": 1
    },
    "annotation": "item"
  },
  {
    "question": "Which district has the highest number of reported crimes?",
    "code": "district_crime_counts = Crime['district_no'].value_counts()\nmost_crime_district = District[District['district_no'] == district_crime_counts.idxmax()]['district_name'].iloc[0]\nprint(most_crime_district)",
    "database": "chicago_crime",
    "choices": [
      "Grand Crossing",
      "Chicago Lawn",
      "Harrison",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 2,
      "16k": 1
    },
    "annotation": "item"
  },
  {
    "question": "What is the average population of the wards with reported vandalism crimes?",
    "code": "vandalism_fbi_code_no = FBI_Code[FBI_Code['title'] == 'Vandalism']['fbi_code_no'].iloc[0]\nvandalism_crimes = Crime[Crime['fbi_code_no'] == vandalism_fbi_code_no]\nvandalism_wards = vandalism_crimes['ward_no'].unique()\naverage_population_vandalism_wards = Ward[Ward['ward_no'].isin(vandalism_wards)]['Population'].mean()\nprint(average_population_vandalism_wards)",
    "database": "chicago_crime",
    "choices": [
      "53822.04347826087",
      "53557.294117647056",
      "54101.0",
      "53751.71428571428"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 0,
      "16k": 2
    },
    "annotation": "average"
  },
  {
    "question": "How many neighborhoods have reported crimes categorized under \"Persons\" according to the FBI code description?",
    "code": "persons_fbi_codes = FBI_Code[FBI_Code['crime_against'] == 'Persons']['fbi_code_no']\ncrime_persons = Crime[Crime['fbi_code_no'].isin(persons_fbi_codes)]\nneighborhoods_with_persons_crimes = crime_persons['community_area_no'].unique()\nnumber_of_neighborhoods = Neighborhood[Neighborhood['community_area_no'].isin(neighborhoods_with_persons_crimes)]['neighborhood_name'].nunique()\nprint(number_of_neighborhoods)",
    "database": "chicago_crime",
    "choices": [
      "156",
      "103",
      "208",
      "8"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "What is the name of the community area with the highest number of arrests for 'NARCOTICS' related crimes?",
    "code": "narcotics_iucr_nos = IUCR[IUCR['primary_description'] == 'NARCOTICS']['iucr_no']\nnarcotics_crimes = Crime[Crime['iucr_no'].isin(narcotics_iucr_nos) & Crime['arrest']]\ncommunity_area_narcotics_counts = narcotics_crimes['community_area_no'].value_counts()\nhighest_narcotics_area_no = community_area_narcotics_counts.idxmax()\nhighest_narcotics_community_name = Community_Area[Community_Area['community_area_no'] == highest_narcotics_area_no]['community_area_name'].values[0]\nprint(highest_narcotics_community_name)",
    "database": "chicago_crime",
    "choices": [
      "North Lawndale",
      "Near North Side",
      "Humboldt Park",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "How many community areas do aldermen with a city hall office room ending in '0' represent?",
    "code": "aldermen_with_office = Ward[Ward['city_hall_office_room'] % 10 == 0]['ward_no']\ncommunity_areas = Crime[Crime['ward_no'].isin(aldermen_with_office)]['community_area_no'].unique()\nnumber_of_community_areas = len(community_areas)\nprint(number_of_community_areas)",
    "database": "chicago_crime",
    "choices": [
      "36",
      "73",
      "61",
      "71"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many unique papers cited in the `cites` table are classified as \"Agents\" in the `paper` table?",
    "code": "agents_papers = set(paper[paper['class_label'] == 'Agents']['paper_id'])\ncited_agents = len(cites[cites['cited_paper_id'].isin(agents_papers)]['cited_paper_id'].unique())\nprint(cited_agents)",
    "database": "citeseer",
    "choices": [
      "28",
      "77",
      "6",
      "217"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the number of unique words cited by papers classified as \"DB\" in the `paper` table?",
    "code": "db_papers = set(paper[paper['class_label'] == 'DB']['paper_id'])\nwords_cited_by_db = len(content[content['paper_id'].isin(db_papers)]['word_cited_id'].unique())\nprint(words_cited_by_db)",
    "database": "citeseer",
    "choices": [
      "74",
      "143",
      "568",
      "266"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "How many papers in the `paper` table are cited by others, according to the `cites` table?",
    "code": "cited_papers = len(set(cites['cited_paper_id']) & set(paper['paper_id']))\nprint(cited_papers)",
    "database": "citeseer",
    "choices": [
      "1275",
      "40",
      "125",
      "357"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many unique papers are both citing and cited but also have a class label of \"DB\" in the `paper` table?",
    "code": "db_papers = set(paper[paper['class_label'] == 'DB']['paper_id'])\nciting_and_cited = len(set(cites['cited_paper_id']).intersection(set(cites['citing_paper_id'])).intersection(db_papers))\nprint(citing_and_cited)",
    "database": "citeseer",
    "choices": [
      "133",
      "8",
      "24",
      "2"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "What is the total number of unique word_cited_id cited by papers that themselves cite other papers?",
    "code": "papers_citing_others = set(cites['citing_paper_id'])\nwords_cited_by_citers = len(content[content['paper_id'].isin(papers_citing_others)]['word_cited_id'].unique())\nprint(words_cited_by_citers)",
    "database": "citeseer",
    "choices": [
      "1216",
      "174",
      "40",
      "424"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many papers are cited more than once based on the combined information from the cites and paper tables?",
    "code": "cited_counts = cites['cited_paper_id'].value_counts()\ncited_more_than_once = cited_counts[cited_counts > 1]\nnum_cited_more_than_once = len(cited_more_than_once)\nprint(num_cited_more_than_once)",
    "database": "citeseer",
    "choices": [
      "788",
      "48",
      "121",
      "283"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the total number of words associated with the papers that are being cited?",
    "code": "cited_papers = cites['cited_paper_id'].unique()\nwords_of_cited_papers = content[content['paper_id'].isin(cited_papers)]['word_cited_id'].nunique()\nprint(words_of_cited_papers)",
    "database": "citeseer",
    "choices": [
      "366",
      "42",
      "132",
      "1094"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many unique papers in the cites table belong to the ML class label in the paper table?",
    "code": "# Merge the cites table with the paper table on cited_paper_id and paper_id\ncited_papers_with_labels = cites.merge(paper, left_on='cited_paper_id', right_on='paper_id', how='inner')\n\n# Filter the merged DataFrame for papers with class_label 'ML'\nml_papers = cited_papers_with_labels[cited_papers_with_labels['class_label'] == 'ML']\n\n# Get the number of unique papers\nunique_ml_papers_count = ml_papers['cited_paper_id'].nunique()\n\nprint(unique_ml_papers_count)",
    "database": "citeseer",
    "choices": [
      "60",
      "6",
      "225",
      "17"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "Which word_cited_id appears most frequently in papers with the AI class label?",
    "code": "# Merge the paper table with the content table on paper_id\nmerged_paper_content = paper.merge(content, on='paper_id', how='inner')\n\n# Filter the merged DataFrame for papers with class_label 'AI'\nai_papers_content = merged_paper_content[merged_paper_content['class_label'] == 'AI']\n\n# Find the most common word_cited_id in AI papers\nmost_common_word = ai_papers_content['word_cited_id'].value_counts().idxmax()\n\nprint(most_common_word)",
    "database": "citeseer",
    "choices": [
      "word1798",
      "word817",
      "word2277",
      "word620"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many distinct citing papers are there for papers with the HCI class label in the paper table?",
    "code": "# Merge the cites table with the paper table on cited_paper_id and paper_id\nciting_papers_with_hci = cites.merge(paper, left_on='cited_paper_id', right_on='paper_id', how='inner')\n\n# Filter the merged DataFrame for papers with class_label 'HCI'\nhci_citing_papers = citing_papers_with_hci[citing_papers_with_hci['class_label'] == 'HCI']\n\n# Count the distinct citing papers\ndistinct_citing_papers_count = hci_citing_papers['citing_paper_id'].nunique()\n\nprint(distinct_citing_papers_count)",
    "database": "citeseer",
    "choices": [
      "60",
      "269",
      "6",
      "20"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many unique word_cited_ids are associated with papers from the 'IR' class label?",
    "code": "# Get paper_ids of 'IR' papers\nir_paper_ids = paper[paper['class_label'] == 'IR']['paper_id']\n\n# Count unique word_cited_ids associated with these IR papers\nunique_words_ir = content[content['paper_id'].isin(ir_paper_ids)]['word_cited_id'].nunique()\nprint(unique_words_ir)",
    "database": "citeseer",
    "choices": [
      "66",
      "242",
      "135",
      "538"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many papers have cited publications with the 'Agents' class label?",
    "code": "# Get paper_ids of 'Agents' papers\nagents_paper_ids = paper[paper['class_label'] == 'Agents']['paper_id']\n\n# Find citing papers that have cited these 'Agents' papers\ncited_agents_papers = cites[cites['cited_paper_id'].isin(agents_paper_ids)]['citing_paper_id'].nunique()\nprint(cited_agents_papers)",
    "database": "citeseer",
    "choices": [
      "125",
      "6",
      "30",
      "400"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many papers that are cited belong to the 'IR' class label?",
    "code": "# Merge the cites and paper DataFrames to find papers cited and their class labels\ncited_with_labels = cites.merge(paper, left_on='cited_paper_id', right_on='paper_id')\n\n# Count the number of papers with the class label 'IR'\nir_cited_count = cited_with_labels[cited_with_labels['class_label'] == 'IR'].shape[0]\n\nprint(ir_cited_count)",
    "database": "citeseer",
    "choices": [
      "38",
      "15",
      "134",
      "621"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "How many unique citing papers belong to the 'DB' class label?",
    "code": "# Merge the cites and paper DataFrames to identify the citing papers and their class labels\nciting_with_labels = cites.merge(paper, left_on='citing_paper_id', right_on='paper_id')\n\n# Count the number of unique citing papers with the class label 'DB'\nunique_db_citing_count = citing_with_labels[citing_with_labels['class_label'] == 'DB']['citing_paper_id'].nunique()\n\nprint(unique_db_citing_count)",
    "database": "citeseer",
    "choices": [
      "35",
      "312",
      "93",
      "8"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the total number of words cited by papers in the 'Agents' class?",
    "code": "# Merge the content and paper DataFrames to find the word_cited_ids for Agents class papers\nagents_papers = paper[paper['class_label'] == 'Agents']\nagents_word_cited = content.merge(agents_papers, on='paper_id')\n\n# Count the total number of word citations by Agents class papers\ntotal_agents_word_cited = agents_word_cited['word_cited_id'].count()\n\nprint(total_agents_word_cited)",
    "database": "citeseer",
    "choices": [
      "756",
      "75",
      "151",
      "287"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "For which institution is the discrepancy between expected national awards and actual awards the largest in absolute terms?",
    "code": "# Calculate the discrepancy between expected national awards and actual awards\ninstitution_details['award_discrepancy'] = (institution_details['exp_award_natl_value'] - institution_details['awards_per_natl_value']).abs()\n\n# Find the institution with the largest discrepancy\nlargest_discrepancy_inst = institution_details.loc[institution_details['award_discrepancy'].idxmax()]['chronname']\n\nprint(largest_discrepancy_inst)",
    "database": "college_completion",
    "choices": [
      "Central Baptist College",
      "California College of the Arts",
      "Birmingham-Southern College",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "What is the total number of 4-year bachelor's graduates in New Mexico for the year 2013?",
    "code": "# Filter state_sector_grads for 4-year bachelor's graduates in New Mexico for the year 2013\nnm_graduates_2013 = state_sector_grads[(state_sector_grads['state'] == 'New Mexico') & (state_sector_grads['level'] == '4-year') & (state_sector_grads['year'] == 2013)]\n\n# Calculate the total number of graduates\ntotal_nm_graduates_2013 = nm_graduates_2013['grad_cohort'].sum()\nprint(total_nm_graduates_2013)",
    "database": "college_completion",
    "choices": [
      "87151",
      "871271513",
      "151",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the average aid percentile of institutions in states with more than 20 public schools offering 4-year programs?",
    "code": "public_schools_state_ids = state_sector_details[(state_sector_details['control'] == 'Public') & (state_sector_details['level'] == '4-year') & (state_sector_details['schools_count'] > 20)]['stateid']\ninstitutions_in_states = institution_details[institution_details['state'].isin(state_sector_details[state_sector_details['stateid'].isin(public_schools_state_ids)]['state'])]\naverage_aid_percentile = institutions_in_states['aid_percentile'].mean()\nprint(average_aid_percentile)",
    "database": "college_completion",
    "choices": [
      "49.0625",
      "47.02127659574468",
      "47.07692307692308",
      "50.0"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "How many institutions have an expected award state value greater than the average state appropriations value for private for-profit 4-year schools in New Jersey?",
    "code": "nj_state_appr_value = state_sector_details[(state_sector_details['control'] == 'Private for-profit') & (state_sector_details['level'] == '4-year') & (state_sector_details['state'] == 'New Jersey')]['state_appr_value'].mean()\ninstitutions_above_threshold = institution_details[institution_details['exp_award_state_value'] > nj_state_appr_value].shape[0]\nprint(institutions_above_threshold)",
    "database": "college_completion",
    "choices": [
      "46",
      "240",
      "22",
      "121"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the total number of unique ingredients used across all recipes in the dataset?",
    "code": "unique_ingredients_count = Quantity['ingredient_id'].nunique()\nprint(unique_ingredients_count)",
    "database": "cookbook",
    "choices": [
      "42",
      "273",
      "80",
      "132"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "Which recipe has the highest total fat per serving as per the Nutrition table?",
    "code": "Recipe_Nutrition = Recipe.merge(Nutrition, on='recipe_id')\nhighest_fat_recipe = Recipe_Nutrition.loc[Recipe_Nutrition['total_fat'] == Recipe_Nutrition['total_fat'].max(), 'title'].iloc[0]\nprint(highest_fat_recipe)",
    "database": "cookbook",
    "choices": [
      "Lime-Basted Lamb Kabobs",
      "Ginger-Orange Barbecued Beef Back Ribs",
      "Tropical Stuffed Cornish Hens",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many recipes include 'margarine' as an ingredient?",
    "code": "margarine_id = Ingredient.loc[Ingredient['name'] == 'margarine', 'ingredient_id'].iloc[0]\nrecipes_with_margarine = Quantity[Quantity['ingredient_id'] == margarine_id]['recipe_id'].nunique()\nprint(recipes_with_margarine)",
    "database": "cookbook",
    "choices": [
      "4",
      "10",
      "5",
      "3"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the average protein content per serving across all recipes?",
    "code": "average_protein_per_serving = Nutrition['protein'].mean()\nprint(average_protein_per_serving)",
    "database": "cookbook",
    "choices": [
      "15.151067415730338",
      "16.74914893617021",
      "17.128666666666668",
      "15.278203592814371"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the average sodium content in recipes that include the ingredient \"salt\"?",
    "code": "salt_id = Ingredient[Ingredient['name'] == 'salt']['ingredient_id'].iloc[0]\nrecipes_with_salt = Quantity[Quantity['ingredient_id'] == salt_id]['recipe_id']\naverage_sodium = Nutrition[Nutrition['recipe_id'].isin(recipes_with_salt)]['sodium'].mean()\nprint(average_sodium)",
    "database": "cookbook",
    "choices": [
      "529.0699999999999",
      "568.6804761904762",
      "441.6866666666667",
      "621.6618181818181"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average calorie content of recipes that include \"margarine\" as an ingredient?",
    "code": "# Merge on Ingredient tables to find the ingredient_id for \"margarine\"\nmargarine_id = Ingredient.loc[Ingredient['name'] == 'margarine', 'ingredient_id'].values[0]\n\n# Find recipe_ids that include margarine using the Quantity table\nmargarine_recipes = Quantity.loc[Quantity['ingredient_id'] == margarine_id, 'recipe_id']\n\n# Filter the Nutrition table for these recipes and calculate the average calorie content\naverage_calories = Nutrition[Nutrition['recipe_id'].isin(margarine_recipes)]['calories'].mean()\nprint(average_calories)",
    "database": "cookbook",
    "choices": [
      "184.916",
      "145.865",
      "155.61666666666667",
      "166.77800000000002"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many recipes list an ingredient from the \"fresh herbs\" category?",
    "code": "# Merge on Ingredient to find all ingredient_ids in the \"fresh herbs\" category\nfresh_herbs_ids = set(Ingredient.loc[Ingredient['category'] == 'fresh herbs', 'ingredient_id'])\n\n# Find distinct recipe_ids that include any of these ingredients in the Quantity table\nrecipes_with_fresh_herbs = Quantity[Quantity['ingredient_id'].isin(fresh_herbs_ids)]['recipe_id'].nunique()\nprint(recipes_with_fresh_herbs)",
    "database": "cookbook",
    "choices": [
      "4",
      "2",
      "15",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average iron content in recipes that include any baking product?",
    "code": "# Find ingredient_ids in the 'baking products' category\nbaking_products_ids = Ingredient.loc[Ingredient['category'] == 'baking products', 'ingredient_id']\n\n# Find recipe_ids using these baking products from the Quantity table\nrecipes_with_baking_products = Quantity[Quantity['ingredient_id'].isin(baking_products_ids)]['recipe_id'].drop_duplicates()\n\n# Calculate the average iron content for these recipes from the Nutrition table\naverage_iron = Nutrition[Nutrition['recipe_id'].isin(recipes_with_baking_products)]['iron'].mean()\nprint(average_iron)",
    "database": "cookbook",
    "choices": [
      "1.55",
      "1.6083333333333334",
      "1.598",
      "1.6076923076923075"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the total sum of calories for all recipes that include onion as an ingredient?",
    "code": "# Merge Quantity with Ingredient to find recipes with onion\nonion_recipes = Quantity.merge(Ingredient[Ingredient['name'] == 'onion'], on='ingredient_id')['recipe_id']\n\n# Filter Nutrition data for recipes with onion and calculate total calories\ntotal_onion_calories = Nutrition[Nutrition['recipe_id'].isin(onion_recipes)]['calories'].sum()\nprint(total_onion_calories)",
    "database": "cookbook",
    "choices": [
      "1129.7",
      "3090.42",
      "724.9300000000001",
      "1377.46"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the total cooking time (sum of prep, cook, and stand minutes) for the recipe with the highest protein content?",
    "code": "# Find the recipe_id with the highest protein content\nhighest_protein_recipe_id = Nutrition.loc[Nutrition['protein'].idxmax(), 'recipe_id']\n\n# Find the total cooking time for this recipe\ntotal_cooking_time = Recipe.loc[Recipe['recipe_id'] == highest_protein_recipe_id, ['prep_min', 'cook_min', 'stnd_min']].sum().sum()\nprint(total_cooking_time)",
    "database": "cookbook",
    "choices": [
      "105",
      "180",
      "1505",
      "103"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "Which recipe has the highest percentage of calories from carbohydrates?",
    "code": "# Find the recipe_id with the highest percentage of calories from carbohydrates\nhighest_carb_recipe_id = Nutrition.loc[Nutrition['pcnt_cal_carb'].idxmax(), 'recipe_id']\n\n# Find the title of this recipe from the Recipe table\nhighest_carb_recipe_title = Recipe.loc[Recipe['recipe_id'] == highest_carb_recipe_id, 'title'].values[0]\nprint(highest_carb_recipe_title)",
    "database": "cookbook",
    "choices": [
      "-Stewed Rhubarb-",
      "-Jam-",
      "Lemony Poached Golden Apples",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "What is the total consumption for customers whose CustomerID appears in the transactions_1k table?",
    "code": "# Merge transactions_1k with yearmonth on CustomerID to get Consumption data\nmerged = transactions_1k.merge(yearmonth, on='CustomerID')\n\n# Sum the Consumption for unique customer entries in the merged DataFrame\ntotal_consumption = merged['Consumption'].unique().sum()\n\nprint(total_consumption)",
    "database": "debit_card_specializing",
    "choices": [
      "2455338.4000000004",
      "0.0",
      "23696.11",
      "465132.56999999995"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many distinct products have been purchased by SMEs?",
    "code": "sme_customers = customers[customers['Segment'] == 'SME']\nsme_transactions = transactions_1k[transactions_1k['CustomerID'].isin(sme_customers['CustomerID'])]\ndistinct_products = sme_transactions['ProductID'].nunique()\nprint(distinct_products)",
    "database": "debit_card_specializing",
    "choices": [
      "1",
      "10",
      "3",
      "0"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "Which product has generated the highest total revenue across all transactions?",
    "code": "# Merge transactions_1k with products to obtain product descriptions\nmerged_data = transactions_1k.merge(products, on='ProductID')\n\n# Calculate total revenue for each product (Price * Amount) and find the product with the highest value\nproduct_revenue = merged_data.groupby('Description').apply(lambda x: (x['Price'] * x['Amount']).sum())\n\n# Find the product with the highest total revenue\ntop_product = product_revenue.idxmax()\nprint(top_product)",
    "database": "debit_card_specializing",
    "choices": [
      "Nafta Plus",
      "Natural",
      "Nafta",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the total consumption for customers who have transactions also listed in transactions_1k?",
    "code": "# Get unique CustomerIDs from transactions_1k\ntransaction_customers = transactions_1k['CustomerID'].unique()\n\n# Filter yearmonth by these CustomerIDs and sum up the consumption\ntotal_consumption = yearmonth[yearmonth['CustomerID'].isin(transaction_customers)]['Consumption'].sum()\nprint(total_consumption)",
    "database": "debit_card_specializing",
    "choices": [
      "2455338.4",
      "465132.57000000007",
      "0.0",
      "23696.11"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many gas stations are categorized as \"Premium\" in the CZE country?",
    "code": "premium_cze_gasstations = gasstations[(gasstations['Segment'] == 'Premium') & (gasstations['Country'] == 'CZE')]\nnum_premium_cze_gasstations = len(premium_cze_gasstations)\nprint(num_premium_cze_gasstations)",
    "database": "debit_card_specializing",
    "choices": [
      "52",
      "100",
      "28",
      "411"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "Which gas station has the highest total transaction amount?",
    "code": "total_by_gasstation = transactions_1k.groupby('GasStationID')['Amount'].sum()\nmax_gasstation_id = total_by_gasstation.idxmax()\nprint(max_gasstation_id)",
    "database": "debit_card_specializing",
    "choices": [
      "4347",
      "3250",
      "1425",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the total number of matches played by Belgian teams as either HomeTeam or AwayTeam in the dataset?",
    "code": "belgian_divisions = divisions[divisions['country'] == 'Belgium']['division'].tolist()\nbelgian_matches = matchs[(matchs['Div'].isin(belgian_divisions))]\ntotal_belgian_matches = len(belgian_matches)\nprint(total_belgian_matches)",
    "database": "european_football_1",
    "choices": [
      "6",
      "94",
      "45",
      "22"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "How many matches have been played in the \u201cLigue 2\u201d division?",
    "code": "ligue2_division = divisions[divisions['name'] == 'Ligue 2']['division'].iloc[0]\nligue2_matches = matchs[matchs['Div'] == ligue2_division]\nligue2_match_count = len(ligue2_matches)\nprint(ligue2_match_count)",
    "database": "european_football_1",
    "choices": [
      "138",
      "15",
      "69",
      "31"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "What is the name of the division with the least number of matches played?",
    "code": "match_counts = matchs['Div'].value_counts()\nleast_played_division_code = match_counts.idxmin()\nleast_played_division_name = divisions[divisions['division'] == least_played_division_code]['name'].iloc[0]\nprint(least_played_division_name)",
    "database": "european_football_1",
    "choices": [
      "Scottish Championship",
      "Scottish League One",
      "Division 1A",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "During which season did the Turkish league (S\u00fcper Lig) record the highest number of games?",
    "code": "# Filter the matchs DataFrame for S\u00fcper Lig matches\ns\u00fcper_lig_matches = matchs[matchs['Div'] == 'T1']\n\n# Count the number of matches per season\ngames_per_season = s\u00fcper_lig_matches.groupby('season').size()\n\n# Find the season with the most games\nmost_games_season = games_per_season.idxmax()\n\nprint(most_games_season)",
    "database": "european_football_1",
    "choices": [
      "2018",
      "2012",
      "2015",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of goals scored by teams from Italy as the home team across all seasons?",
    "code": "italy_matches = matchs.merge(divisions[divisions['country'] == 'Italy'], left_on='Div', right_on='division')\ntotal_italy_home_goals = italy_matches['FTHG'].sum()\nprint(total_italy_home_goals)",
    "database": "european_football_1",
    "choices": [
      "64",
      "452",
      "123",
      "218"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of clients who own a card?",
    "code": "clients_with_card = disp[disp['disp_id'].isin(card['disp_id'])]['client_id'].nunique()\nprint(clients_with_card)",
    "database": "financial",
    "choices": [
      "64",
      "256",
      "128",
      "512"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average loan amount for clients who own a 'classic' card?",
    "code": "classic_card_disp_ids = card[card['type'] == 'classic']['disp_id']\nclassic_card_client_ids = disp[disp['disp_id'].isin(classic_card_disp_ids)]['client_id']\nclassic_card_account_ids = disp[disp['client_id'].isin(classic_card_client_ids)]['account_id']\nclassic_card_loans = loan[loan['account_id'].isin(classic_card_account_ids)]\naverage_loan_amount = classic_card_loans['amount'].mean()\nprint(average_loan_amount)",
    "database": "financial",
    "choices": [
      "134133.09677419355",
      "112463.07692307692",
      "197808.0",
      "178776.0"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many clients have both a loan and a card issued to them?",
    "code": "loan_account_ids = loan['account_id'].unique()\nloan_client_ids = disp[disp['account_id'].isin(loan_account_ids)]['client_id']\ncard_client_ids = disp[disp['disp_id'].isin(card['disp_id'])]['client_id']\nclients_with_loan_and_card = loan_client_ids[loan_client_ids.isin(card_client_ids)].nunique()\nprint(clients_with_loan_and_card)",
    "database": "financial",
    "choices": [
      "17",
      "80",
      "2",
      "1"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "What is the total number of transactions made by clients who received a loan?",
    "code": "loan_account_ids = loan['account_id'].unique()\nloan_transaction_count = trans[trans['account_id'].isin(loan_account_ids)].shape[0]\nprint(loan_transaction_count)",
    "database": "financial",
    "choices": [
      "3",
      "7",
      "74",
      "18"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "How many clients have an account in the same district as their card was issued?",
    "code": "merged_data = disp.merge(client, on='client_id').merge(account, on=['account_id', 'district_id']).merge(card, on='disp_id')\nunique_clients = len(merged_data.client_id.unique())\nprint(unique_clients)",
    "database": "financial",
    "choices": [
      "219",
      "456",
      "51",
      "109"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "What is the total loan amount for accounts that have made orders with the k_symbol \"SIPO\"?",
    "code": "order_sipo = order[order['k_symbol'] == 'SIPO']\nmerged_loans = loan.merge(order_sipo, on='account_id')\ntotal_loan_amount = merged_loans['amount_x'].sum()  # Using amount_x due to the suffix applied during merge\nprint(total_loan_amount)",
    "database": "financial",
    "choices": [
      "0",
      "107352",
      "3861408",
      "554724"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of orders made by clients who have a classic card?",
    "code": "classic_card_clients = card[card['type'] == 'classic'].merge(disp, on='disp_id')\nclassic_card_orders = classic_card_clients.merge(order, on='account_id')\ntotal_orders = len(classic_card_orders)\nprint(total_orders)",
    "database": "financial",
    "choices": [
      "1",
      "36",
      "5",
      "0"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the average loan amount for clients in districts with a higher than average A11?",
    "code": "average_a11 = district['A11'].mean()\nhigh_a11_districts = district[district['A11'] > average_a11]\naccounts_in_high_a11 = account[account['district_id'].isin(high_a11_districts['district_id'])]\nloans_in_high_a11 = loan.merge(accounts_in_high_a11, on='account_id')\naverage_loan_amount = loans_in_high_a11['amount'].mean()\nprint(average_loan_amount)",
    "database": "financial",
    "choices": [
      "122026.90909090909",
      "197808.0",
      "132970.1739130435",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "What is the total payment amount for Completed loans?",
    "code": "completed_loans = loan[loan['status'] == 'A']\ntotal_payment_amount = completed_loans['payments'].sum()\nprint(total_payment_amount)",
    "database": "financial",
    "choices": [
      "82627.0",
      "656973.0",
      "344016.0",
      "170666.0"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of clients from 'Prague' with at least one loan?",
    "code": "# Merge necessary tables to find clients in Prague\nprague_clients = client.merge(district, left_on='district_id', right_on='district_id').query(\"A3 == 'Prague'\")['client_id']\n\n# Get accounts from the 'disp' table for these clients\nprague_client_accounts = disp.query(\"client_id in @prague_clients\")['account_id']\n\n# Count unique prague clients who have loans\ntotal_clients_with_loans = loan.query(\"account_id in @prague_client_accounts\")['account_id'].nunique()\n\nprint(total_clients_with_loans)",
    "database": "financial",
    "choices": [
      "2",
      "1",
      "8",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "How many businesses have received a routine inspection score?",
    "code": "unique_business_routine_scores = inspections[(inspections['type'] == 'Routine - Unscheduled') & (inspections['score'].notna())]['business_id'].nunique()\nprint(unique_business_routine_scores)",
    "database": "food_inspection",
    "choices": [
      "52",
      "413",
      "109",
      "208"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average inspection score for businesses located in San Francisco (SF)?",
    "code": "sf_businesses = businesses[businesses['city'].str.upper() == 'SF']['business_id']\nsf_inspections = inspections[inspections['business_id'].isin(sf_businesses) & inspections['score'].notna()]\naverage_sf_score = sf_inspections['score'].mean()\nprint(average_sf_score)",
    "database": "food_inspection",
    "choices": [
      "89.71428571428571",
      "90.28260869565217",
      "90.5",
      "89.15384615384616"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "How many businesses with a high-risk violation exist?",
    "code": "high_risk_businesses = violations[violations['risk_category'] == 'High Risk']['business_id'].unique()\nnum_high_risk_businesses = len(high_risk_businesses)\nprint(num_high_risk_businesses)",
    "database": "food_inspection",
    "choices": [
      "17",
      "113",
      "33",
      "58"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What percentage of businesses have at least one violation recorded in the dataset?",
    "code": "violated_businesses = violations['business_id'].unique()\ntotal_businesses = businesses['business_id'].nunique()\npercentage_violated = (len(violated_businesses) / total_businesses) * 100\nprint(percentage_violated)",
    "database": "food_inspection",
    "choices": [
      "98.15078236130867",
      "99.17808219178083",
      "101.06382978723406",
      "98.3957219251337"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "What is the average inspection score for businesses with a listed phone number?",
    "code": "merged = businesses.dropna(subset=['phone_number']).merge(inspections, on='business_id', how='inner')\naverage_score = merged['score'].mean()\nprint(average_score)",
    "database": "food_inspection",
    "choices": [
      "90.27118644067797",
      "91.8",
      "90.88679245283019",
      "89.64285714285714"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "How many businesses, identified by their names, have received a routine inspection score above 90?",
    "code": "merged = businesses.merge(inspections, on='business_id', how='inner')\nhigh_score = merged.loc[(merged['score'] > 90) & (merged['type'] == 'Routine - Unscheduled')]\nunique_business_names = len(high_score['name'].unique())\nprint(unique_business_names)",
    "database": "food_inspection",
    "choices": [
      "230",
      "53",
      "112",
      "29"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many inspections resulted in violations for businesses located in San Francisco?",
    "code": "sf_businesses = businesses[businesses['city'] == 'San Francisco']\nmerged_inspections = sf_businesses.merge(inspections, on='business_id', how='inner')\nmerged_violations = merged_inspections.merge(violations, on=['business_id', 'date'], how='inner')\ninspection_with_violations = len(merged_violations)\nprint(inspection_with_violations)",
    "database": "food_inspection",
    "choices": [
      "2",
      "1",
      "6",
      "0"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many businesses have had violations without any inspection records?",
    "code": "businesses_with_violations = violations['business_id'].unique()\nbusinesses_with_inspections = inspections['business_id'].unique()\nno_inspection_violations = len(set(businesses_with_violations) - set(businesses_with_inspections))\nprint(no_inspection_violations)",
    "database": "food_inspection",
    "choices": [
      "563",
      "95",
      "179",
      "336"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the average inspection score for businesses located in San Francisco?",
    "code": "sf_business_ids = businesses[businesses['city'] == \"San Francisco\"]['business_id']\nsf_inspections = inspections[inspections['business_id'].isin(sf_business_ids)]\naverage_score_sf = sf_inspections['score'].mean()\nprint(average_score_sf)",
    "database": "food_inspection",
    "choices": [
      "88.8125",
      "88.625",
      "89.25862068965517",
      "90.36842105263158"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "Which business with available business certificate data has received the highest inspection score?",
    "code": "non_nan_certificates = businesses[businesses['business_certificate'].notna()]['business_id']\nvalid_inspections = inspections[inspections['business_id'].isin(non_nan_certificates)]\nhighest_score_business_id = valid_inspections.loc[valid_inspections['score'].idxmax()]['business_id']\nhighest_score_business_name = businesses[businesses['business_id'] == highest_score_business_id]['name'].values[0]\nprint(highest_score_business_name)",
    "database": "food_inspection",
    "choices": [
      "LAW SCHOOL CAFE",
      "S & P Concessions - Mendocino",
      "YICK'S LIQUOR",
      "Quickly"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many businesses that have been inspected are managed by owners residing outside of San Francisco?",
    "code": "sf_owners = businesses[businesses['owner_city'] != \"San Francisco\"]\ninspected_outside_sf_owners = inspections[inspections['business_id'].isin(sf_owners['business_id'].unique())]\nunique_businesses = inspected_outside_sf_owners['business_id'].nunique()\nprint(unique_businesses)",
    "database": "food_inspection",
    "choices": [
      "58",
      "458",
      "115",
      "247"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many businesses have had inspections without a recorded score?",
    "code": "# Merge inspections with businesses to find which businesses have inspections but with no score\ninspections_no_score = inspections[inspections['score'].isnull()]\nbusinesses_with_no_score_inspections = inspections_no_score.merge(businesses, on='business_id')\nnumber_of_businesses = businesses_with_no_score_inspections['business_id'].nunique()\nprint(number_of_businesses)",
    "database": "food_inspection",
    "choices": [
      "168",
      "81",
      "323",
      "43"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many unique businesses have both violations and inspections recorded?",
    "code": "# Find unique business_ids that have both violations and inspections\nbusinesses_with_violations = violations['business_id'].unique()\nbusinesses_with_inspections = inspections['business_id'].unique()\ncommon_businesses = set(businesses_with_violations).intersection(businesses_with_inspections)\ncount_common_businesses = len(common_businesses)\nprint(count_common_businesses)",
    "database": "food_inspection",
    "choices": [
      "0",
      "5",
      "127",
      "26"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the number of businesses that have either 'New Ownership' or 'New Construction' inspection types but do not have any associated high-risk violations?",
    "code": "# Find businesses with specified inspection types, exclude those with high-risk violations\nspecified_inspections = inspections[inspections['type'].isin(['New Ownership', 'New Construction'])]\nhigh_risk_violation_businesses = violations[violations['risk_category'] == 'High Risk']['business_id'].unique()\nbusinesses_no_high_risk = specified_inspections[~specified_inspections['business_id'].isin(high_risk_violation_businesses)]\nnumber_of_businesses = businesses_no_high_risk['business_id'].nunique()\nprint(number_of_businesses)",
    "database": "food_inspection",
    "choices": [
      "17",
      "56",
      "33",
      "97"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average inspection score of businesses with \"H25\" tax code?",
    "code": "business_ids_with_h25 = businesses[businesses['tax_code'] == 'H25']['business_id']\nh25_inspections = inspections[inspections['business_id'].isin(business_ids_with_h25)]\naverage_score_h25 = h25_inspections['score'].mean()\nprint(average_score_h25)",
    "database": "food_inspection",
    "choices": [
      "85.25806451612904",
      "87.824",
      "85.5",
      "86.15873015873017"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the postal code of the business with the highest inspection score?",
    "code": "highest_score_business_id = inspections.iloc[inspections['score'].idxmax()]['business_id']\npostal_code = businesses[businesses['business_id'] == highest_score_business_id]['postal_code'].iloc[0]\nprint(postal_code)",
    "database": "food_inspection",
    "choices": [
      "94111",
      "94117",
      "94112",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many businesses have received both a \"Routine - Unscheduled\" inspection and have low risk violations?",
    "code": "routine_unscheduled_business_ids = inspections[inspections['type'] == 'Routine - Unscheduled']['business_id'].unique()\nlow_risk_business_ids = violations[violations['risk_category'] == 'Low Risk']['business_id'].unique()\ncombined_count = len(set(routine_unscheduled_business_ids).intersection(low_risk_business_ids))\nprint(combined_count)",
    "database": "food_inspection",
    "choices": [
      "0",
      "46",
      "3",
      "8"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "Which constructor had the highest number of wins documented in the constructorStandings table?",
    "code": "merged = constructorStandings.merge(constructors, on='constructorId')\nconstructor_wins = merged.groupby('name').sum('wins')\ntop_constructor = constructor_wins['wins'].idxmax()\nprint(top_constructor)",
    "database": "formula_1",
    "choices": [
      "Ferrari",
      "McLaren",
      "Benetton",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 1,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "Which circuit has hosted the maximum number of races?",
    "code": "race_counts = races.groupby('circuitId').size()\ntop_circuit_id = race_counts.idxmax()\ncircuit_info = circuits[circuits['circuitId'] == top_circuit_id]\ncircuit_name = circuit_info['name'].values[0]\nprint(circuit_name)",
    "database": "formula_1",
    "choices": [
      "Silverstone Circuit",
      "Circuit de Monaco",
      "Autodromo Nazionale di Monza",
      "Hockenheimring"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many unique circuits have hosted races where Renault scored points?",
    "code": "# Find races where Renault scored points\nrenault_id = constructors[constructors['name'] == 'Renault']['constructorId'].iloc[0]\nrenault_races_with_points = constructorResults[(constructorResults['constructorId'] == renault_id) & (constructorResults['points'] > 0)]\n\n# Get the circuitIds related to these races\nrenault_circuits = races[races['raceId'].isin(renault_races_with_points['raceId'])]['circuitId'].unique()\n\n# Count unique circuits\nunique_renault_circuits_count = len(renault_circuits)\nprint(unique_renault_circuits_count)",
    "database": "formula_1",
    "choices": [
      "4",
      "0",
      "1",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What proportion of genes on chromosome 7 are part of the Proteasome complex?",
    "code": "genes_on_chr7 = Genes[Genes['Chromosome'] == 7]\nproteasome_genes_on_chr7 = genes_on_chr7[genes_on_chr7['Complex'] == 'Proteasome']\nproportion = len(proteasome_genes_on_chr7) / len(genes_on_chr7) if len(genes_on_chr7) > 0 else 0\nprint(proportion)",
    "database": "genes",
    "choices": [
      "0.037037037037037035",
      "0.03773584905660377",
      "0.07407407407407407",
      "0.14285714285714285"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "What is the average expression correlation for physical interactions?",
    "code": "physical_interactions = Interactions[Interactions['Type'] == 'Physical']\naverage_expression_corr = physical_interactions['Expression_Corr'].mean()\nprint(average_expression_corr)",
    "database": "genes",
    "choices": [
      "0.34214796894495403",
      "0.34128793198305085",
      "0.3392569634608379",
      "0.3498349179613734"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "How many genes localized in the cytoplasm are involved in physical interactions?",
    "code": "cytoplasm_genes = Genes.loc[Genes['Localization'] == 'cytoplasm', 'GeneID']\nphysical_interactions = Interactions.loc[Interactions['Type'] == 'Physical']\ncytoplasm_physical_interactions = physical_interactions[\n    (physical_interactions['GeneID1'].isin(cytoplasm_genes)) | \n    (physical_interactions['GeneID2'].isin(cytoplasm_genes))\n]\nnumber_of_cytoplasm_physical_interactions = len(cytoplasm_physical_interactions['GeneID1'].unique()) + len(cytoplasm_physical_interactions['GeneID2'].unique()) \nprint(number_of_cytoplasm_physical_interactions)",
    "database": "genes",
    "choices": [
      "47",
      "156",
      "107",
      "10"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "How many genes involved in 'Protein Synthesis' have identified interactions?",
    "code": "protein_synthesis_genes = Genes[Genes['Function'].str.contains('PROTEIN SYNTHESIS', na=False)]\ninteracting_genes = Interactions[(Interactions['GeneID1'].isin(protein_synthesis_genes['GeneID'])) |\n                                 (Interactions['GeneID2'].isin(protein_synthesis_genes['GeneID']))]\nunique_genes_in_interactions = set(interacting_genes['GeneID1']).union(set(interacting_genes['GeneID2']))\nprotein_synthesis_genes_with_interactions = protein_synthesis_genes['GeneID'].isin(unique_genes_in_interactions).sum()\nprint(protein_synthesis_genes_with_interactions)",
    "database": "genes",
    "choices": [
      "13",
      "25",
      "1",
      "6"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many genes involved in 'Cell Growth, Cell Division and DNA Synthesis' have a known motif?",
    "code": "cell_growth_genes = Genes[Genes['Function'].str.contains('CELL GROWTH, CELL DIVISION AND DNA SYNTHESIS', na=False)]\ngenes_with_motif = cell_growth_genes['Motif'].notnull().sum()\nprint(genes_with_motif)",
    "database": "genes",
    "choices": [
      "36",
      "64",
      "251",
      "127"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the correlation of expression for the interaction between essential genes with localization in the nucleus?",
    "code": "nucleus_essential_genes = Genes[(Genes['Essential'] == 'Essential') & (Genes['Localization'] == 'nucleus')]\ninteractions_nucleus_essential = Interactions[\n    (Interactions['GeneID1'].isin(nucleus_essential_genes['GeneID'])) &\n    (Interactions['GeneID2'].isin(nucleus_essential_genes['GeneID']))\n]\nnucleus_gene_correlations = interactions_nucleus_essential['Expression_Corr']\naverage_correlation = nucleus_gene_correlations.mean() if not nucleus_gene_correlations.empty else None\nprint(average_correlation)",
    "database": "genes",
    "choices": [
      "0.6345136616666667",
      "0.3987144496944445",
      "0.43611620165",
      "0.468805052"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "Calculate the average height in cm of players who have appeared in at least one game.",
    "code": "merged_data = PlayerInfo.merge(height_info, how='left', left_on='height', right_on='height_id')\nplayers_with_games = merged_data[merged_data['GP_greater_than_0'] == 'yes']\naverage_height_cm = players_with_games['height_in_cm'].mean()\nprint(average_height_cm)",
    "database": "ice_hockey_draft",
    "choices": [
      "185.26415094339623",
      "185.43781094527364",
      "185.37853107344634",
      "185.18333333333334"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the total number of goals (G) scored by players drafted in a round below the fifth?",
    "code": "PlayerInfo_filtered = PlayerInfo[PlayerInfo['draftround'] < 5]\nmerged_data = PlayerInfo_filtered.merge(SeasonStatus, on='ELITEID', how='inner')\ntotal_goals = merged_data['G'].sum()\nprint(total_goals)",
    "database": "ice_hockey_draft",
    "choices": [
      "1758",
      "450",
      "3514",
      "919"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "What is the average total points (P) per game played (GP) for the players listed in PlayerInfo who have more than 0 sum_7yr_GP?",
    "code": "# Filter players with sum_7yr_GP > 0\neligible_players = PlayerInfo[PlayerInfo['sum_7yr_GP'] > 0]\n# Merge eligible players with SeasonStatus to get season data\nmerged_data = eligible_players.merge(SeasonStatus, on='ELITEID')\n# Calculate total points and games played\ntotal_points_per_game = merged_data['P'].sum() / merged_data['GP'].sum()\nprint(total_points_per_game)",
    "database": "ice_hockey_draft",
    "choices": [
      "0.7191135734072022",
      "0.717391304347826",
      "0.7302072714916752",
      "0.7394966118102614"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "Which player in the PlayerInfo table has the most entries in the SeasonStatus table? ",
    "code": "# Count the number of entries per player in SeasonStatus\nentries_per_player = SeasonStatus['ELITEID'].value_counts()\n# Get the ELITEID of the player with the most entries\nplayer_most_entries = entries_per_player.idxmax()\n# Find the player's name in the PlayerInfo table\nplayer_name = PlayerInfo.loc[PlayerInfo['ELITEID'] == player_most_entries, 'PlayerName'].values[0]\nprint(player_name)",
    "database": "ice_hockey_draft",
    "choices": [
      "Johan Halvardsson",
      "Jonas Nordqvist",
      "Mattias Tedenby",
      "Jonas Johansson"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "What is the average height in cm of players who have played more than 10 games in a season?",
    "code": "# Filter SeasonStatus for players with GP greater than 10\nplayers_with_games = SeasonStatus[SeasonStatus['GP'] > 10]['ELITEID'].unique()\n\n# Get heights for these players from PlayerInfo and calculate the average height in cm\naverage_height_cm = PlayerInfo[PlayerInfo['ELITEID'].isin(players_with_games)].merge(height_info, left_on='height', right_on='height_id')['height_in_cm'].mean()\n\nprint(average_height_cm)",
    "database": "ice_hockey_draft",
    "choices": [
      "185.64694280078896",
      "185.46886446886447",
      "185.93333333333334",
      "185.6326530612245"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many players have been drafted but never played a game?",
    "code": "# Filter PlayerInfo for players drafted and not played any game\nplayers_never_played = PlayerInfo[(PlayerInfo['draftyear'].notnull()) & (PlayerInfo['sum_7yr_GP'] == 0)].shape[0]\n\nprint(players_never_played)",
    "database": "ice_hockey_draft",
    "choices": [
      "138",
      "472",
      "259",
      "64"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the number of different teams played by players from Canada in the `SeasonStatus` table?",
    "code": "# Get Canadian players from PlayerInfo\ncanadian_players = PlayerInfo[PlayerInfo['nation'] == 'Canada']['ELITEID'].unique()\n\n# Count distinct teams for these players in SeasonStatus\nunique_teams_canadian = SeasonStatus[SeasonStatus['ELITEID'].isin(canadian_players)]['TEAM'].nunique()\n\nprint(unique_teams_canadian)",
    "database": "ice_hockey_draft",
    "choices": [
      "53",
      "84",
      "119",
      "30"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the most common birthplace of players who achieved over 50 total points in a season in the `SeasonStatus` table?",
    "code": "# Calculate total points (G + A) for each season and filter those over 50\nplayers_over_50_points = SeasonStatus.groupby('ELITEID')[['G', 'A']].sum().reset_index()\nplayers_over_50_points['total_points'] = players_over_50_points['G'] + players_over_50_points['A']\nplayers_ids = players_over_50_points[players_over_50_points['total_points'] > 50]['ELITEID']\n\n# Find the most common birthplace from these players\nmost_common_birthplace = PlayerInfo[PlayerInfo['ELITEID'].isin(players_ids)]['birthplace'].mode()[0]\n\nprint(most_common_birthplace)",
    "database": "ice_hockey_draft",
    "choices": [
      "Edmonton, AB, CAN",
      "Buckingham, QC, CAN",
      "Augusta, GA, USA",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the average weight in kilograms of players who have played more than zero games?",
    "code": "# Filter players from PlayerInfo who have played more than zero games\nplayers_with_games = PlayerInfo[PlayerInfo[\"GP_greater_than_0\"] == \"yes\"]\n\n# Calculate the average weight in kilograms for these players\naverage_weight_kg = players_with_games[\"weight\"].mean()\n\nprint(average_weight_kg)",
    "database": "ice_hockey_draft",
    "choices": [
      "205.1044776119403",
      "204.34463276836158",
      "205.48333333333332",
      "205.0943396226415"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "What is the average height in centimeters of players drafted by the \"Anaheim Ducks\"?",
    "code": "# Filter players drafted by Anaheim Ducks\nanaheim_ducks_players = PlayerInfo[PlayerInfo[\"overallby\"] == \"Anaheim Ducks\"]\n\n# Get heights in centimeters for these players\nplayer_heights_cm = anaheim_ducks_players[\"height\"]\n\n# Use height_info to get the corresponding height in centimeters\nheights_cm = height_info[height_info[\"height_id\"].isin(player_heights_cm)][\"height_in_cm\"]\n\n# Calculate the average height in centimeters\naverage_height_cm = heights_cm.mean()\n\nprint(average_height_cm)",
    "database": "ice_hockey_draft",
    "choices": [
      "186.0",
      "183.75",
      "187.8",
      "184.77777777777777"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "Which player has the highest plus/minus in the 'WC' league?",
    "code": "# Filter games in WC league\nwc_league_games = SeasonStatus[SeasonStatus[\"LEAGUE\"] == \"WC\"]\n\n# Find the player with the highest plus/minus\nplayer_highest_plusminus = wc_league_games.loc[wc_league_games[\"PLUSMINUS\"].idxmax()]\n\n# Get the ELITEID of the player\nelite_id_highest_plusminus = player_highest_plusminus[\"ELITEID\"]\n\n# Find the player's name from PlayerInfo\nplayer_name = PlayerInfo.loc[PlayerInfo[\"ELITEID\"] == elite_id_highest_plusminus, \"PlayerName\"].iloc[0]\n\nprint(player_name)",
    "database": "ice_hockey_draft",
    "choices": [
      "Karel Pilar",
      "Mark Streit",
      "Petteri Nummelin",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "How many players from the 'Canada' nation have been drafted in the second round?",
    "code": "# Filter Canadian players from PlayerInfo who were drafted in the second round\ncanadian_players_drafted_2nd_round = PlayerInfo[(PlayerInfo[\"nation\"] == \"Canada\") & (PlayerInfo[\"draftround\"] == 2)]\n\n# Count these players\nnum_canadian_players_drafted_2nd = len(canadian_players_drafted_2nd_round)\n\nprint(num_canadian_players_drafted_2nd)",
    "database": "ice_hockey_draft",
    "choices": [
      "51",
      "10",
      "33",
      "18"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "How many times does the PRED_CLASS \"wearing\" appear in IMG_REL?",
    "code": "# Find PRED_CLASS_ID for \"wearing\" from PRED_CLASSES\nwearing_id = PRED_CLASSES.loc[PRED_CLASSES['PRED_CLASS'] == 'wearing', 'PRED_CLASS_ID'].values[0]\n# Count occurrences of \"wearing\" in IMG_REL\nwearing_count = IMG_REL[IMG_REL['PRED_CLASS_ID'] == wearing_id].shape[0]\nprint(wearing_count)",
    "database": "image_and_language",
    "choices": [
      "9",
      "17",
      "3",
      "4"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "What is the smallest width (W) of the objects labeled with the attribute \"dark\"?",
    "code": "# Find ATT_CLASS_ID for \"dark\" in ATT_CLASSES\ndark_id = ATT_CLASSES.loc[ATT_CLASSES['ATT_CLASS'] == 'dark', 'ATT_CLASS_ID'].values[0]\n# Find all OBJ_SAMPLE_IDs in IMG_OBJ_ATT that have the \"dark\" attribute\ndark_obj_samples = IMG_OBJ_ATT[IMG_OBJ_ATT['ATT_CLASS_ID'] == dark_id]['OBJ_SAMPLE_ID']\n# Filter IMG_OBJ for these OBJ_SAMPLE_IDs, and find the minimum width (W)\nsmallest_dark_width = IMG_OBJ[IMG_OBJ['OBJ_SAMPLE_ID'].isin(dark_obj_samples)]['W'].min()\nprint(smallest_dark_width)",
    "database": "image_and_language",
    "choices": [
      "4",
      "5",
      "3",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the name of the first attribute class that appears in IMG_OBJ_ATT?",
    "code": "# Get the first ATT_CLASS_ID from IMG_OBJ_ATT\nfirst_att_class_id = IMG_OBJ_ATT['ATT_CLASS_ID'].iloc[0]\n# Find the corresponding ATT_CLASS name in ATT_CLASSES\nfirst_att_class_name = ATT_CLASSES.loc[ATT_CLASSES['ATT_CLASS_ID'] == first_att_class_id, 'ATT_CLASS'].values[0]\nprint(first_att_class_name)",
    "database": "image_and_language",
    "choices": [
      "bunched",
      "very tall",
      "jean",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "Which image (IMG_ID) contains the largest number of unique object classes?",
    "code": "# Group IMG_OBJ by IMG_ID and calculate the number of unique OBJ_CLASS_IDs in each image\nunique_obj_counts = IMG_OBJ.groupby('IMG_ID')['OBJ_CLASS_ID'].nunique()\n# Find the IMG_ID with the maximum count of unique object classes\nimg_with_most_unique_objs = unique_obj_counts.idxmax()\nprint(img_with_most_unique_objs)",
    "database": "image_and_language",
    "choices": [
      "2365059",
      "2410461",
      "2401953",
      "2386101"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "What is the most common object class described as \"dark\" across all image objects?",
    "code": "dark_att_id = ATT_CLASSES.loc[ATT_CLASSES['ATT_CLASS'] == 'dark', 'ATT_CLASS_ID'].values[0]\ndark_objects = IMG_OBJ_ATT[IMG_OBJ_ATT['ATT_CLASS_ID'] == dark_att_id]['OBJ_SAMPLE_ID']\ndark_obj_classes = IMG_OBJ[IMG_OBJ['OBJ_SAMPLE_ID'].isin(dark_objects)]\nmost_common_dark_obj_class_id = dark_obj_classes['OBJ_CLASS_ID'].mode().iloc[0]\nmost_common_dark_obj = OBJ_CLASSES.loc[OBJ_CLASSES['OBJ_CLASS_ID'] == most_common_dark_obj_class_id, 'OBJ_CLASS'].values[0]\nprint(most_common_dark_obj)",
    "database": "image_and_language",
    "choices": [
      "arm",
      "sign",
      "building",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "For images containing \"umbrella,\" how many unique attributes are associated with these objects?",
    "code": "umbrella_obj_id = OBJ_CLASSES.loc[OBJ_CLASSES['OBJ_CLASS'] == 'umbrellas', 'OBJ_CLASS_ID'].values[0]\numbrella_objects = IMG_OBJ[IMG_OBJ['OBJ_CLASS_ID'] == umbrella_obj_id]['OBJ_SAMPLE_ID']\numbrella_attrs = IMG_OBJ_ATT[IMG_OBJ_ATT['OBJ_SAMPLE_ID'].isin(umbrella_objects)]\nunique_attributes_count = umbrella_attrs['ATT_CLASS_ID'].nunique()\nprint(unique_attributes_count)",
    "database": "image_and_language",
    "choices": [
      "29",
      "3",
      "7",
      "13"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the total number of unique object classes associated with attribute classes in the dataset?",
    "code": "merged_data = IMG_OBJ.merge(IMG_OBJ_ATT, on=['IMG_ID', 'OBJ_SAMPLE_ID'])\nunique_classes = merged_data['OBJ_CLASS_ID'].nunique()\nprint(unique_classes)",
    "database": "image_and_language",
    "choices": [
      "88",
      "29",
      "136",
      "48"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "How many unique images contain an object with both object and attribute classes defined?",
    "code": "merged_obj_att = IMG_OBJ.merge(IMG_OBJ_ATT, on=['IMG_ID', 'OBJ_SAMPLE_ID']).merge(OBJ_CLASSES, on='OBJ_CLASS_ID').merge(ATT_CLASSES, on='ATT_CLASS_ID')\nunique_images_with_classes = merged_obj_att['IMG_ID'].nunique()\nprint(unique_images_with_classes)",
    "database": "image_and_language",
    "choices": [
      "128",
      "256",
      "64",
      "32"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "What is the most frequently appearing object class in images associated with a specific attribute class (e.g., 'blue')?",
    "code": "blue_att_class_id = ATT_CLASSES.query(\"ATT_CLASS == 'blue'\")['ATT_CLASS_ID'].iloc[0]\nblue_objects = IMG_OBJ_ATT.query(f\"ATT_CLASS_ID == {blue_att_class_id}\").merge(IMG_OBJ, on=['IMG_ID', 'OBJ_SAMPLE_ID']).merge(OBJ_CLASSES, on='OBJ_CLASS_ID')\nmost_frequent_obj_in_blue = blue_objects['OBJ_CLASS'].mode()[0]\nprint(most_frequent_obj_in_blue)",
    "database": "image_and_language",
    "choices": [
      "sky",
      "shirt",
      "plane",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many unique relationships exist between objects in the \"IMG_REL\" table?",
    "code": "unique_relationships = IMG_REL.merge(PRED_CLASSES, on='PRED_CLASS_ID')['PRED_CLASS'].nunique()\nprint(unique_relationships)",
    "database": "image_and_language",
    "choices": [
      "19",
      "26",
      "15",
      "38"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the most frequently appearing object class in the dataset?",
    "code": "merged_data = IMG_OBJ.merge(OBJ_CLASSES, on=\"OBJ_CLASS_ID\")\nmost_common_object_class = merged_data['OBJ_CLASS'].mode()[0]\nprint(most_common_object_class)",
    "database": "image_and_language",
    "choices": [
      "man",
      "windows",
      "trees",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the number of current politicians listed in current_terms who have a religious affiliation based on the \"religion_bio\" field in the current table?",
    "code": "current_ids_with_religion = current[['bioguide_id', 'religion_bio']].dropna()\ncurrent_terms_ids = current_terms['bioguide'].dropna().unique()\nids_with_religion_in_terms = current_ids_with_religion[current_ids_with_religion['bioguide_id'].isin(current_terms_ids)].dropna()['bioguide_id'].nunique()\nprint(ids_with_religion_in_terms)",
    "database": "legislator",
    "choices": [
      "86",
      "16",
      "56",
      "30"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "How many unique politicians in the current table have served in more than one term in their current role, based on the current_terms table?",
    "code": "current_terms_count = current_terms.groupby('bioguide').size()\nmultiple_terms = current_terms_count[current_terms_count > 1].index\nunique_politicians_multiple_terms = len(set(current['bioguide_id'].dropna()).intersection(multiple_terms))\nprint(unique_politicians_multiple_terms)",
    "database": "legislator",
    "choices": [
      "16",
      "5",
      "2",
      "59"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many individuals are there in the current dataset who have a social media presence on Facebook?",
    "code": "current_bioguide_ids = current['bioguide_id'].tolist()\nfacebook_users = social_media[social_media['facebook'].notna() & social_media['bioguide'].isin(current_bioguide_ids)]\nnum_facebook_users = len(facebook_users)\nprint(num_facebook_users)",
    "database": "legislator",
    "choices": [
      "91",
      "10",
      "3",
      "28"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the most common religion among current members with a Facebook presence?",
    "code": "current_facebook_bioguide_ids = social_media[social_media['facebook'].notna()]['bioguide']\ncurrent_with_facebook = current[current['bioguide_id'].isin(current_facebook_bioguide_ids)]\nmost_common_religion = current_with_facebook['religion_bio'].value_counts().idxmax()\nprint(most_common_religion)",
    "database": "legislator",
    "choices": [
      "Jewish",
      "Catholic",
      "Roman Catholic",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "How many unique middle names appear in both the current and historical datasets?",
    "code": "current_middle_names = current['middle_name'].dropna().unique()\nhistorical_middle_names = historical['middle_name'].dropna().unique()\ncommon_middle_names = set(current_middle_names).intersection(historical_middle_names)\nnum_unique_common_middle_names = len(common_middle_names)\nprint(num_unique_common_middle_names)",
    "database": "legislator",
    "choices": [
      "14",
      "3",
      "6",
      "8"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the number of females in the current dataset who also have their contact information available in the current_terms table?",
    "code": "female_with_contact = current.merge(current_terms, left_on='bioguide_id', right_on='bioguide', how='inner')\nresult = len(female_with_contact[female_with_contact['gender_bio'] == 'F'])\nprint(result)",
    "database": "legislator",
    "choices": [
      "67",
      "60",
      "45",
      "36"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "What is the average district number for current representatives who are registered as Democrats and have a listed religion in the `current` table?",
    "code": "current_democrats = current_terms.query(\"party == 'Democrat'\")\ncurrent_democrats_ids = current_democrats['bioguide']\ndemocratic_representatives = current[(current['bioguide_id'].isin(current_democrats_ids)) & (current['religion_bio'].notnull())]\naverage_district = current_terms[current_terms['bioguide'].isin(democratic_representatives['bioguide_id'])]['district'].mean()\nprint(average_district)",
    "database": "legislator",
    "choices": [
      "13.25",
      "9.783783783783784",
      "8.661764705882353",
      "8.666666666666666"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "How many unique users have discussed their mental health with coworkers across all surveys?",
    "code": "unique_users_discussed_mental_health = Answer[(Answer['QuestionID'] == 60)].UserID.nunique()\nprint(unique_users_discussed_mental_health)",
    "database": "mental_health_survey",
    "choices": [
      "56",
      "29",
      "7",
      "15"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "What is the total number of survey responses related to the perception of mental health support in the industry?",
    "code": "total_perception_responses = Answer[Answer['QuestionID'] == 86].shape[0]\nprint(total_perception_responses)",
    "database": "mental_health_survey",
    "choices": [
      "28",
      "10",
      "6",
      "54"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many different states or territories were mentioned by participants in the 2014 survey?",
    "code": "states_2014 = Answer[(Answer['SurveyID'] == 2014) & (Answer['QuestionID'] == 4)].AnswerText.nunique()\nprint(states_2014)",
    "database": "mental_health_survey",
    "choices": [
      "19",
      "15",
      "3",
      "8"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "What was the most commonly answered question across all surveys?",
    "code": "most_common_question_id = Answer['QuestionID'].mode()[0]\nmost_common_question_text = Question.loc[Question['questionid'] == most_common_question_id, 'questiontext'].values[0]\nprint(most_common_question_text)",
    "database": "mental_health_survey",
    "choices": [
      "Does your employer provide mental health benefits as part of healthcare coverage?",
      "What is your age?",
      "How many employees does your company or organization have?",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "How many users participated in the 2016 survey?",
    "code": "users_2016 = Answer[Answer['SurveyID'] == 2016]['UserID'].nunique()\nprint(users_2016)",
    "database": "mental_health_survey",
    "choices": [
      "1259",
      "474",
      "940",
      "267"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "Which year had the least number of answers for the question \"Have you ever been diagnosed with a mental health disorder?\"?",
    "code": "question_id = Question[Question['questiontext'] == \"Have you ever been diagnosed with a mental health disorder?\"]['questionid'].values[0]\nyear_with_least_answers = Answer[Answer['QuestionID'] == question_id].merge(Survey, on='SurveyID').groupby('Description').size().idxmin()\nprint(year_with_least_answers)",
    "database": "mental_health_survey",
    "choices": [
      "mental health survey for 2019",
      "mental health survey for 2018",
      "mental health survey for 2017",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the most common question asked in the 2019 survey?",
    "code": "# Filter for answers from the 2019 survey\nanswers_2019 = Answer[Answer['SurveyID'] == 2019]\n\n# Group by QuestionID to find the most common question\nmost_common_question_id = answers_2019['QuestionID'].mode()[0]\n\n# Find the text of the most common question\nmost_common_question_text = Question.loc[Question['questionid'] == most_common_question_id, 'questiontext'].values[0]\n\nprint(most_common_question_text)",
    "database": "mental_health_survey",
    "choices": [
      "Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?",
      "Overall, how much importance did your previous employer place on mental health?",
      "Describe the conversation you had with your previous coworkers about your mental health including their reactions.",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many participants in the 2018 survey provided an answer other than -1?",
    "code": "# Filter for answers from the 2018 survey\nanswers_2018 = Answer[Answer['SurveyID'] == 2018]\n\n# Count the number of users with answers not equal to -1\nparticipants_with_valid_answers = answers_2018[answers_2018['AnswerText'] != \"-1\"]['UserID'].nunique()\n\nprint(participants_with_valid_answers)",
    "database": "mental_health_survey",
    "choices": [
      "256",
      "133",
      "69",
      "341"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many unique users participated in the 2019 survey?",
    "code": "# Filter for answers from the 2019 survey\nanswers_2019 = Answer[Answer['SurveyID'] == 2019]\n\n# Count the number of unique users\nunique_users_2019 = answers_2019['UserID'].nunique()\n\nprint(unique_users_2019)",
    "database": "mental_health_survey",
    "choices": [
      "319",
      "80",
      "245",
      "122"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "How many unique users participated in all the surveys combined?",
    "code": "unique_users_count = Answer['UserID'].nunique()\nprint(unique_users_count)",
    "database": "mental_health_survey",
    "choices": [
      "3409",
      "1256",
      "696",
      "2489"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many distinct questions were answered in the 2018 mental health survey?",
    "code": "distinct_questions_2018 = Answer[Answer['SurveyID'] == 2018]['QuestionID'].nunique()\nprint(distinct_questions_2018)",
    "database": "mental_health_survey",
    "choices": [
      "59",
      "71",
      "76",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of users who responded to any question in the 2017 mental health survey?",
    "code": "total_users_2017 = Answer[Answer['SurveyID'] == 2017]['UserID'].nunique()\nprint(total_users_2017)",
    "database": "mental_health_survey",
    "choices": [
      "168",
      "566",
      "307",
      "714"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average lowest price of dishes listed in menus with more than 5 pages?",
    "code": "# Find menus with more than 5 pages\nlarge_menus = Menu[Menu['page_count'] > 5]\n\n# Find dishes that appear in these menus\nmenu_page_ids = MenuPage[MenuPage['menu_id'].isin(large_menus['id'])]['id']\nmenu_item_dishes = MenuItem[MenuItem['menu_page_id'].isin(menu_page_ids)]['dish_id']\n\n# Calculate average lowest price of these dishes\nrelevant_dishes = Dish[Dish['id'].isin(menu_item_dishes)]\naverage_lowest_price = relevant_dishes['lowest_price'].mean()\n\nprint(average_lowest_price)",
    "database": "menu",
    "choices": [
      "0.5660714285714286",
      "1.2349999999999999",
      "1.7916666666666667",
      "0.4218181818181818"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many unique venues are associated with dishes that have appeared over 50 times?",
    "code": "# Find dishes that appeared over 50 times\npopular_dishes = Dish[Dish['times_appeared'] > 50]['id']\n\n# Find menus containing these dishes\npopular_menu_item_ids = MenuItem[MenuItem['dish_id'].isin(popular_dishes)]['menu_page_id']\npopular_menus = MenuPage[MenuPage['id'].isin(popular_menu_item_ids)]['menu_id']\n\n# Get unique venues from these menus\nunique_venues = Menu[Menu['id'].isin(popular_menus)]['venue'].nunique()\n\nprint(unique_venues)",
    "database": "menu",
    "choices": [
      "11",
      "6",
      "3",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "How many dishes first appeared before 1920 and are associated with menus using \"Dollars\" as currency?",
    "code": "# Find dishes that first appeared before 1920\nearly_dishes = Dish[Dish['first_appeared'] < 1920]['id']\n\n# Find menus using \"Dollars\" as currency\ndollar_menus = Menu[Menu['currency'] == 'Dollars']['id']\n\n# Find these dishes in the dollar menus\ndollar_menu_page_ids = MenuPage[MenuPage['menu_id'].isin(dollar_menus)]['id']\nearly_dollar_dishes = MenuItem[(MenuItem['menu_page_id'].isin(dollar_menu_page_ids)) & (MenuItem['dish_id'].isin(early_dishes))]['dish_id'].nunique()\n\nprint(early_dollar_dishes)",
    "database": "menu",
    "choices": [
      "68",
      "139",
      "245",
      "34"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "How many menus feature dishes that were last appeared after 1950?",
    "code": "# Find dishes that last appeared after 1950\nrecent_dishes = Dish[Dish['last_appeared'] > 1950]['id']\n\n# Find menus featuring these dishes\nrecent_menu_item_ids = MenuItem[MenuItem['dish_id'].isin(recent_dishes)]['menu_page_id']\nrecent_menus = MenuPage[MenuPage['id'].isin(recent_menu_item_ids)]['menu_id'].nunique()\n\nprint(recent_menus)",
    "database": "menu",
    "choices": [
      "65",
      "130",
      "37",
      "246"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the average number of dishes per menu in the dataset?",
    "code": "average_dishes_per_menu = MenuItem['menu_page_id'].map(MenuPage.set_index('id')['menu_id']).nunique() / len(Menu)\nprint(average_dishes_per_menu)",
    "database": "menu",
    "choices": [
      "1.0122448979591836",
      "1.0061349693251533",
      "1.016260162601626",
      "1.0158730158730158"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "Which dish has appeared on the most number of distinct menus?",
    "code": "dish_menu_count = MenuItem.groupby('dish_id')['menu_page_id'].nunique().idxmax()\ndish_name = Dish.loc[Dish['id'] == dish_menu_count, 'name'].values[0]\nprint(dish_name)",
    "database": "menu",
    "choices": [
      "Gruyere",
      "Celery",
      "Coffee",
      "St. Julien"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "Which menu had the highest total number of dishes listed?",
    "code": "menu_with_most_dishes = Menu.loc[Menu['id'] == MenuItem['menu_page_id'].map(MenuPage.set_index('id')['menu_id']).value_counts().idxmax(), 'location'].values[0]\nprint(menu_with_most_dishes)",
    "database": "menu",
    "choices": [
      "Walton's Old Homestead Oyster and Chop House",
      "Cavanagh's",
      "Waldorf Astoria",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the range of menu page sizes (i.e., area in square centimeters) across all menus?",
    "code": "# Calculate the area of each menu page\nMenuPage['area'] = MenuPage['full_height'] * MenuPage['full_width']\n\n# Determine the range of menu page sizes\npage_size_range = MenuPage['area'].max() - MenuPage['area'].min()\n\nprint(page_size_range)",
    "database": "menu",
    "choices": [
      "38115414",
      "35419722",
      "46570301",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average net worth of actors who appear in movies with a runtime greater than 120 minutes?",
    "code": "merged_data = characters.merge(movie, on='MovieID').merge(actor, on='ActorID')\nfiltered_data = merged_data[merged_data['Runtime'] > 120]\navg_net_worth = filtered_data['NetWorth'].replace('', 0).replace('nan', 0).str.replace(',', '').str.replace('$', '').astype(float).mean()\nprint(avg_net_worth)",
    "database": "movie",
    "choices": [
      "46086666.666666664",
      "45179487.179487176",
      "66681818.18181818",
      "54086619.71830986"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the total gross of all movies wherein actors with unknown height have participated?",
    "code": "actor_unknown_height = actor[actor['Height (Inches)'].isna()]\nmerged_data = characters.merge(movie, on='MovieID').merge(actor_unknown_height, on='ActorID')\ntotal_gross = merged_data['Gross'].sum()\nprint(total_gross)",
    "database": "movie",
    "choices": [
      "4904138342",
      "26082024821",
      "15378834643",
      "8509528602"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "What is the gross revenue of movies in which actors with a net worth of $100,000,000 or more appeared?",
    "code": "merged_data = characters.merge(actor, on='ActorID').merge(movie, on='MovieID')\nhigh_networth_revenue = merged_data[merged_data['NetWorth'] == '$100,000,000.00']['Gross'].sum()\nprint(high_networth_revenue)",
    "database": "movie",
    "choices": [
      "733554923",
      "418994745",
      "2852653464",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "Which movie has the highest rating among those that featured actors from the USA?",
    "code": "merged_data = characters.merge(actor, on='ActorID').merge(movie, on='MovieID')\nus_actors_movies = merged_data[merged_data['Birth Country'] == 'USA']\nhighest_rated_movie_title = us_actors_movies.loc[us_actors_movies['Rating'].idxmax(), 'Title']\nprint(highest_rated_movie_title)",
    "database": "movie",
    "choices": [
      "Forrest Gump",
      "Inception",
      "The Dark Knight",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average runtime of movies that include characters played by actors born in the UK?",
    "code": "merged_data = characters.merge(actor, on='ActorID').merge(movie, on='MovieID')\nuk_actors_movies = merged_data[merged_data['Birth Country'] == 'UK']\naverage_runtime = uk_actors_movies['Runtime'].mean()\nprint(average_runtime)",
    "database": "movie",
    "choices": [
      "137.38888888888889",
      "134.265625",
      "138.44444444444446",
      "136.15384615384616"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "How many unique genres are there in movies where actors of more than 68 inches in height played a role?",
    "code": "merged_data = characters.merge(actor, on='ActorID').merge(movie, on='MovieID')\ntall_actors_movies_genres = merged_data[merged_data['Height (Inches)'] > 68]['Genre'].nunique()\nprint(tall_actors_movies_genres)",
    "database": "movie",
    "choices": [
      "10",
      "12",
      "16",
      "14"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "How many unique actors appeared in movies with a budget over $100,000,000?",
    "code": "# Find movies with a budget over $100,000,000\nhigh_budget_movies = movie[movie['Budget'] > 100_000_000]['MovieID']\n\n# Find all actors who appeared in these movies\nactors_in_high_budget_movies = characters[characters['MovieID'].isin(high_budget_movies)]['ActorID'].unique()\n\n# Count the number of unique actors\nunique_actors_count = len(actors_in_high_budget_movies)\n\nprint(unique_actors_count)",
    "database": "movie",
    "choices": [
      "45",
      "160",
      "25",
      "84"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "How many unique actors have appeared in movies with a PG rating in this dataset?",
    "code": "# Getting movies with a PG rating\npg_movies = movie[movie['MPAA Rating'] == 'PG']\n\n# Extracting unique actor IDs who appeared in PG-rated movies\nunique_actors_pg = characters[characters['MovieID'].isin(pg_movies['MovieID'])]['ActorID'].nunique()\nprint(unique_actors_pg)",
    "database": "movie",
    "choices": [
      "11",
      "27",
      "63",
      "129"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the average rating for movies directed by directors with a quality score of 4?",
    "code": "# Merge the movies2directors and directors tables to find directors with quality 4\nqualified_directors = movies2directors.merge(directors[directors['d_quality'] == 4], on='directorid')\n\n# Merge with u2base to map movie ratings\ndirector_movie_ratings = u2base.merge(qualified_directors, on='movieid')\n\n# Calculate the average rating\naverage_rating = director_movie_ratings['rating'].mean()\n\nprint(average_rating)",
    "database": "movielens",
    "choices": [
      "94025650341.84616",
      "1.0835731596878058e+118",
      "inf",
      "3.039252006009009e+34"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "How many unique users rated movies that feature actors with a quality score of 3 or higher?",
    "code": "# Merge movies2actors with actors to filter actors with quality 3 or higher\nqualified_actors = movies2actors.merge(actors[actors['a_quality'] >= 3], on='actorid')\n\n# Merge with u2base to find users who rated these movies\nuser_ratings = u2base.merge(qualified_actors, on='movieid')\n\n# Count unique users\nunique_users = user_ratings['userid'].nunique()\n\nprint(unique_users)",
    "database": "movielens",
    "choices": [
      "505",
      "50",
      "20",
      "155"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many movies involve both a male and female actor in the cast?",
    "code": "# Merge movies2actors with actors to bring in actor genders\nmovies_with_genders = movies2actors.merge(actors, on='actorid')[['movieid', 'a_gender']]\n\n# Pivot to check for the presence of both genders per movie\ngender_pivot = movies_with_genders.pivot_table(index='movieid', columns='a_gender', aggfunc='size', fill_value=0)\n\n# Filter movies with at least one male and one female\nqualified_movies = gender_pivot[(gender_pivot.get('M', 0) > 0) & (gender_pivot.get('F', 0) > 0)]\n\n# Count these movies\nmovies_count = len(qualified_movies)\n\nprint(movies_count)",
    "database": "movielens",
    "choices": [
      "113",
      "1",
      "11",
      "34"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the average running time of English movies rated by female users?",
    "code": "# Filter female users\nfemale_users = users[users['u_gender'] == 'F']\n\n# Merge with u2base to get movies rated by female users\nfemale_rated_movies = u2base.merge(female_users, on='userid')\n\n# Merge with movies to find running time of English movies\nenglish_movies = movies[movies['isEnglish'] == 'T']\nrunning_times = female_rated_movies.merge(english_movies, on='movieid')['runningtime']\n\n# Calculate the average running time\naverage_running_time = running_times.mean()\n\nprint(average_running_time)",
    "database": "movielens",
    "choices": [
      "3.0",
      "2.40625",
      "2.4444444444444446",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many unique directors have directed movies featuring actors with a quality rating of 4?",
    "code": "# Filter actors with a quality rating of 4\nquality_actors = actors[actors['a_quality'] == 4]\n\n# Merge movies2actors with quality_actors to find such movies\nmovies_with_quality_actors = movies2actors[movies2actors['actorid'].isin(quality_actors['actorid'])]\n\n# Merge with movies2directors to find directors of such movies\ndirectors_of_quality_actor_movies = movies2directors[movies2directors['movieid'].isin(movies_with_quality_actors['movieid'])]\n\n# Find the number of unique directors\nunique_directors_count = directors_of_quality_actor_movies['directorid'].nunique()\nprint(unique_directors_count)",
    "database": "movielens",
    "choices": [
      "6",
      "55",
      "22",
      "185"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average rating given by users for movies directed by directors with d_quality 4?",
    "code": "# Merge tables to get director's d_quality\nmovies_with_director = movies2directors.merge(directors, on='directorid')\n# Filter movies with directors having d_quality 4\nhigh_quality_movie_ids = movies_with_director[movies_with_director['d_quality'] == 4]['movieid']\n# Filter ratings for those movies\nhigh_quality_ratings = u2base[u2base['movieid'].isin(high_quality_movie_ids)]['rating']\n# Calculate average rating\naverage_rating = high_quality_ratings.mean()\nprint(average_rating)",
    "database": "movielens",
    "choices": [
      "3.1172839814815124e+33",
      "inf",
      "94025649572.6923",
      "9.497625830959164e+113"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the average running time of USA movies that have at least one actor with a_quality 3?",
    "code": "# Find all actors with a_quality 3\nhigh_quality_actors = actors[actors['a_quality'] == 3]['actorid']\n# Find all movies with at least one high-quality actor\nmovies_with_high_quality_actors = movies2actors[movies2actors['actorid'].isin(high_quality_actors)]['movieid']\n# Filter movies from the USA with these actors\nusa_movies = movies[(movies['country'] == 'USA') & (movies['movieid'].isin(movies_with_high_quality_actors))]\n# Calculate the average running time\naverage_running_time = usa_movies['runningtime'].mean()\nprint(average_running_time)",
    "database": "movielens",
    "choices": [
      "2.206896551724138",
      "2.0773809523809526",
      "2.121311475409836",
      "2.0892531876138434"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "Which genre has the highest average budget for movies in which it's featured?",
    "code": "# Merge the necessary tables to find out genres and their corresponding budgets\nmovies_with_genres = movie.merge(movie_genres, on='movie_id', how='inner')\nmovies_with_budgets_and_genres = movies_with_genres.merge(genre, on='genre_id', how='inner')\n\n# Calculate the average budget per genre\naverage_budget_per_genre = movies_with_budgets_and_genres.groupby('genre_name')['budget'].mean()\n\n# Find the genre with the highest average budget\nhighest_avg_budget_genre = average_budget_per_genre.idxmax()\n\nprint(highest_avg_budget_genre)",
    "database": "movies_4",
    "choices": [
      "Science Fiction",
      "Fantasy",
      "Adventure",
      "Animation"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "What is the name of the highest revenue movie produced in the United States?",
    "code": "# Merge necessary tables to find movies produced in the United States\nus_movies = production_country.merge(country, on='country_id').query(\"country_name == 'United States of America'\")\nus_movies_with_revenue = us_movies.merge(movie, on='movie_id')\n\n# Get the movie with the highest revenue produced in the US\nhighest_revenue_us_movie = us_movies_with_revenue.loc[us_movies_with_revenue['revenue'].idxmax()]['title']\n\nprint(highest_revenue_us_movie)",
    "database": "movies_4",
    "choices": [
      "Cradle Will Rock",
      "Minions",
      "Analyze This",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "Which person has worked as an Executive Producer on the highest number of movies?",
    "code": "# Merge movie_crew with the relevant details to find people who worked as Executive Producers\nexecutive_producers = movie_crew.merge(person, on='person_id', how='inner')\nexecutive_producers = executive_producers.query(\"job == 'Executive Producer'\")\n\n# Count the number of movies each Executive Producer has worked on\nep_movie_counts = executive_producers['person_name'].value_counts()\n\n# Identify the Executive Producer with the most movies\ntop_executive_producer = ep_movie_counts.idxmax()\n\nprint(top_executive_producer)",
    "database": "movies_4",
    "choices": [
      "Mike Medavoy",
      "Peter Levathes",
      "Chad Oman",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 0
    }
  },
  {
    "question": "What is the highest average vote rating for a movie genre across all countries?",
    "code": "movie_genres_with_votes = movie.merge(movie_genres, on='movie_id').merge(genre, on='genre_id')\naverage_votes_by_genre = movie_genres_with_votes.groupby('genre_name')['vote_average'].mean()\nhighest_average_vote = average_votes_by_genre.max()\nprint(highest_average_vote)",
    "database": "movies_4",
    "choices": [
      "7.7",
      "7.300000000000001",
      "7.166666666666667",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average runtime of movies produced in the United States?",
    "code": "us_id = country[country['country_name'] == 'United States of America']['country_id'].iloc[0]\nus_movies = production_country[production_country['country_id'] == us_id]['movie_id']\naverage_runtime = movie[movie['movie_id'].isin(us_movies)]['runtime'].mean()\nprint(average_runtime)",
    "database": "movies_4",
    "choices": [
      "117.5",
      "132.0",
      "114.36666666666666",
      "105.0"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average vote count for movies released in 2009 that are in the Drama genre?",
    "code": "drama_id = genre[genre['genre_name'] == 'Drama']['genre_id'].iloc[0]\ndrama_movies_2009 = movie[(movie['release_date'].str.startswith('2009')) & (movie['movie_id'].isin(movie_genres[movie_genres['genre_id'] == drama_id]['movie_id']))]\naverage_vote_count = drama_movies_2009['vote_count'].mean()\nprint(average_vote_count)",
    "database": "movies_4",
    "choices": [
      "576.0",
      "465.3333333333333",
      "65.0",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "How many unique genres are associated with movies produced in the United States?",
    "code": "# Merge the relevant tables to find the answer\nmerged_data = movie.merge(production_country, on='movie_id')\\\n                   .merge(movie_genres, on='movie_id')\\\n                   .merge(country, on='country_id')\n\n# Filter to only include movies produced in the United States\nus_movies = merged_data[merged_data['country_name'] == 'United States of America']\n\n# Count unique genres for these movies\nunique_genres_count = len(us_movies['genre_id'].unique())\n\nprint(unique_genres_count)",
    "database": "movies_4",
    "choices": [
      "12",
      "2",
      "1",
      "5"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the most common job title in the Crew department?",
    "code": "# Merge the relevant tables to find the answer\ncrew_data = movie_crew.merge(department, on='department_id')\n\n# Filter to only include entries for the Crew department\ncrew_department_jobs = crew_data[crew_data['department_name'] == 'Crew']['job']\n\n# Find the most common job title\nmost_common_job = crew_department_jobs.mode()[0]\n\nprint(most_common_job)",
    "database": "movies_4",
    "choices": [
      "Stunt Coordinator",
      "CG Supervisor",
      "Stunts",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "How many movies are associated with the production company 'Warner Bros.'?",
    "code": "# Merge necessary tables\ncompany_movies = movie_company.merge(production_company, on='company_id')\n\n# Filter movies associated with 'Warner Bros.'\nwarner_bros_movies = company_movies[company_movies['company_name'] == 'Warner Bros.']\n\n# Count the number of these movies\nwarner_bros_movies_count = warner_bros_movies['movie_id'].nunique()\n\nprint(warner_bros_movies_count)",
    "database": "movies_4",
    "choices": [
      "3",
      "9",
      "2",
      "4"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the average budget of movies produced in the United States?",
    "code": "us_movies = movie.merge(production_country[production_country['country_id'] == country[country['country_name'] == 'United States of America']['country_id'].iloc[0]], on='movie_id')\naverage_us_budget = us_movies['budget'].mean()\nprint(average_us_budget)",
    "database": "movies_4",
    "choices": [
      "29916666.666666668",
      "37662126.166666664",
      "40000000.0",
      "0.0"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "What is the runtime of the longest-released movie associated with the Mystery genre?",
    "code": "mystery_movies = movie_genres[movie_genres['genre_id'] == genre[genre['genre_name'] == 'Mystery']['genre_id'].iloc[0]]\nlongest_mystery_movie_runtime = movie.merge(mystery_movies, on='movie_id').query(\"movie_status == 'Released'\")['runtime'].max()\nprint(longest_mystery_movie_runtime)",
    "database": "movies_4",
    "choices": [
      "98",
      "149",
      "118",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "Which artist is associated with the album that has the 'freely.available' tag?",
    "code": "freely_available_id = tags.loc[tags['tag'] == 'freely.available', 'id'].values\nartist_name = torrents.loc[torrents['id'].isin(freely_available_id), 'artist'].iloc[0]\nprint(artist_name)",
    "database": "music_tracker",
    "choices": [
      "topez kno",
      "timeflies",
      "peter rosenberg",
      "cfcf"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of snatched downloads for all torrents that have the \"beats\" tag?",
    "code": "tagged_id = tags[tags['tag'] == 'beats']['id'].values\ntotal_snatched_beats = torrents[torrents['id'].isin(tagged_id)]['totalSnatched'].sum()\nprint(total_snatched_beats)",
    "database": "music_tracker",
    "choices": [
      "3451",
      "664",
      "903",
      "2188"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "Which artist has the highest number of snatches associated with the tag \"electro\"?",
    "code": "electro_ids = tags[tags[\"tag\"] == \"electro\"][\"id\"]\nelectro_torrents = torrents[torrents[\"id\"].isin(electro_ids)]\nartist_snatches = electro_torrents.groupby(\"artist\")[\"totalSnatched\"].sum()\nmost_snatches_artist = artist_snatches.idxmax()\nprint(most_snatches_artist)",
    "database": "music_tracker",
    "choices": [
      "various djs",
      "toro y moi",
      "high fidelity three",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "What is the earliest year a release with the tag \"alternative\" was released?",
    "code": "alternative_ids = tags[tags[\"tag\"] == \"alternative\"][\"id\"]\nalternative_torrents = torrents[torrents[\"id\"].isin(alternative_ids)]\nearliest_year = alternative_torrents[\"groupYear\"].min()\nprint(earliest_year)",
    "database": "music_tracker",
    "choices": [
      "1989",
      "2011",
      "1991",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "How many unique artists have a release that is tagged as \"freely.available\"?",
    "code": "freely_ids = tags[tags[\"tag\"] == \"freely.available\"][\"id\"]\nfreely_torrents = torrents[torrents[\"id\"].isin(freely_ids)]\nunique_artists_count = freely_torrents[\"artist\"].nunique()\nprint(unique_artists_count)",
    "database": "music_tracker",
    "choices": [
      "11",
      "62",
      "15",
      "29"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of snatched torrents associated with the tag 'rock'?",
    "code": "# Filtering tags DataFrame for 'rock', then merging with torrents\nrock_torrent_ids = tags[tags['tag'] == 'rock']['id']\nrock_torrents = torrents[torrents['id'].isin(rock_torrent_ids)]\ntotal_snatched_rock = rock_torrents['totalSnatched'].sum()\nprint(total_snatched_rock)",
    "database": "music_tracker",
    "choices": [
      "135",
      "6543",
      "1297",
      "12643"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many unique release types are there for torrents associated with the tag 'dance'?",
    "code": "# Filtering tags DataFrame for 'dance', merging with torrents, counting unique release types\ndance_torrent_ids = tags[tags['tag'] == 'dance']['id']\ndance_torrents = torrents[torrents['id'].isin(dance_torrent_ids)]\nunique_release_types = dance_torrents['releaseType'].nunique()\nprint(unique_release_types)",
    "database": "music_tracker",
    "choices": [
      "3",
      "4",
      "6",
      "5"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "How many people are from Canada?",
    "code": "# Find the region_id for Canada\ncanada_region_id = noc_region.loc[noc_region['region_name'] == \"Canada\", 'id'].values[0]\n\n# Count the number of people from Canada\ncanadian_person_count = person_region.loc[person_region['region_id'] == canada_region_id, 'person_id'].nunique()\nprint(canadian_person_count)",
    "database": "olympics",
    "choices": [
      "16",
      "31",
      "68",
      "4"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "How many competitors are associated with the 2004 Summer games?",
    "code": "game_id_2004_summer = games.query('games_year == 2004 and season == \"Summer\"')['id'].iloc[0]\ncompetitor_count = games_competitor.query('games_id == @game_id_2004_summer').shape[0]\nprint(competitor_count)",
    "database": "olympics",
    "choices": [
      "49",
      "106",
      "25",
      "12"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "How many unique sports have events associated with a medal of NA?",
    "code": "na_medal_id = medal.query('medal_name == \"NA\"')['id'].iloc[0]\nevent_ids = competitor_event.query('medal_id == @na_medal_id')['event_id'].unique()\nsport_ids = event.query('id in @event_ids')['sport_id'].unique()\nunique_sports_count = len(sport_ids)\nprint(unique_sports_count)",
    "database": "olympics",
    "choices": [
      "35",
      "45",
      "49",
      "53"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "How many female competitors participated in the 2010 Winter games?",
    "code": "winter_2010_games_id = games[(games['games_year'] == 2010) & (games['season'] == 'Winter')]['id'].values[0]\nwinter_2010_competitors = games_competitor[games_competitor['games_id'] == winter_2010_games_id]\nfemale_competitors_ids = person[person['gender'] == 'F']['id']\nfemale_winter_2010_count = len(winter_2010_competitors[winter_2010_competitors['person_id'].isin(female_competitors_ids)])\nprint(female_winter_2010_count)",
    "database": "olympics",
    "choices": [
      "22",
      "4",
      "10",
      "13"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many users who have been Yelping since 2008 are considered elites in any available year?",
    "code": "# Get users yelping since 2008\nusers_since_2008 = Users[Users['user_yelping_since_year'] == 2008]['user_id']\n\n# Find elite users among those users\nelite_users_since_2008 = Elite[Elite['user_id'].isin(users_since_2008)]['user_id'].nunique()\n\nprint(elite_users_since_2008)",
    "database": "public_review_platform",
    "choices": [
      "29",
      "15",
      "58",
      "105"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average number of tips received by businesses that are active and have a review_count labeled as 'High'?",
    "code": "# Filter businesses for active and review_count=='High'\nhigh_review_count_active_business = Business[(Business['active'] == 'true') & (Business['review_count'] == 'High')]['business_id']\n\n# Count the number of tips for these businesses\ntips_count_for_businesses = Tips[Tips['business_id'].isin(high_review_count_active_business)]['business_id'].count()\n\n# Calculate average number of tips\naverage_tips = tips_count_for_businesses / len(high_review_count_active_business)\nprint(average_tips)",
    "database": "public_review_platform",
    "choices": [
      "0.05263157894736842",
      "0.0",
      "0.05555555555555555",
      "0.21875"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "How many elite users in the year 2012 have been active since 2010 or earlier?",
    "code": "# Find users who became elite in 2012\nelite_2012_user_ids = Elite[Elite['year_id'] == Years[Years['actual_year'] == 2012]['year_id'].values[0]]['user_id']\n# Filter these elite users by their yelping start year\neligible_users = Users[(Users['user_id'].isin(elite_2012_user_ids)) & (Users['user_yelping_since_year'] <= 2010)]\n# Count how many users meet the criteria\ncount_elite_users_2012 = len(eligible_users)\nprint(count_elite_users_2012)",
    "database": "public_review_platform",
    "choices": [
      "12",
      "83",
      "22",
      "38"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of reviews that received a \"Low\" rating for funny votes?",
    "code": "# Filter reviews that have \"Low\" funny votes\nlow_funny_reviews_count = Reviews[Reviews['review_votes_funny'] == 'Low'].shape[0]\nprint(low_funny_reviews_count)",
    "database": "public_review_platform",
    "choices": [
      "151",
      "19",
      "35",
      "79"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many check-ins occurred on Sundays across all businesses?",
    "code": "# Get the day_id for Sunday\nsunday_id = Days[Days['day_of_week'] == 'Sunday']['day_id'].values[0]\n# Filter check-ins for Sundays and sum the non-null check-ins\nsunday_checkins_count = Checkins[Checkins['day_id'] == sunday_id].iloc[:, 2:].notna().sum().sum()\nprint(sunday_checkins_count)",
    "database": "public_review_platform",
    "choices": [
      "264",
      "144",
      "552",
      "1608"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average household income in regions where Weimei Corp has made purchases?",
    "code": "# Filter Sales_Orders for purchases by Weimei Corp\nweimei_sales = Sales_Orders[Sales_Orders['_CustomerID'] == Customers[Customers['Customer Names'] == 'Weimei Corp']['CustomerID'].values[0]]\n\n# Get distinct store IDs from these sales and merge with Store_Locations to get household income\nweimei_store_locations = Store_Locations[Store_Locations['StoreID'].isin(weimei_sales['_StoreID'].unique())]\n\n# Calculate average household income\naverage_household_income = weimei_store_locations['Household Income'].mean()\n\nprint(average_household_income)",
    "database": "regional_sales",
    "choices": [
      "63256.0",
      "61638.5",
      "65733.0",
      "62877.181818181816"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many unique customers have placed orders through the \"Online\" sales channel?",
    "code": "# Filter Sales_Orders for the \"Online\" sales channel and get unique customer IDs\nonline_customers = Sales_Orders[Sales_Orders['Sales Channel'] == 'Online']['_CustomerID'].unique()\n\n# Count the number of unique customer IDs\nnum_unique_customers_online = len(online_customers)\n\nprint(num_unique_customers_online)",
    "database": "regional_sales",
    "choices": [
      "34",
      "50",
      "20",
      "46"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "Which customer had the highest total order quantity across all sales orders?",
    "code": "total_order_quantity = Sales_Orders.groupby('_CustomerID')['Order Quantity'].sum()\nhighest_quantity_customer_id = total_order_quantity.idxmax()\nhighest_quantity_customer_name = Customers.loc[Customers['CustomerID'] == highest_quantity_customer_id, 'Customer Names'].values[0]\nprint(highest_quantity_customer_name)",
    "database": "regional_sales",
    "choices": [
      "ETUDE Ltd",
      "Qualitest ",
      "OHTA'S Corp",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "What is the most frequently ordered product in the Midwest region?",
    "code": "midwest_region_store_ids = Store_Locations[Store_Locations['StateCode'].isin(Regions[Regions['Region'] == 'Midwest']['StateCode'])]['StoreID']\nmidwest_region_orders = Sales_Orders[Sales_Orders['_StoreID'].isin(midwest_region_store_ids)]\nmost_frequent_product_id = midwest_region_orders['_ProductID'].mode()[0]\nmost_frequent_product_name = Products.loc[Products['ProductID'] == most_frequent_product_id, 'Product Name'].values[0]\nprint(most_frequent_product_name)",
    "database": "regional_sales",
    "choices": [
      "Pillows",
      "Accessories",
      "Furniture Cushions",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average order quantity for orders delivered in California?",
    "code": "merged_df = Sales_Orders.merge(Store_Locations, left_on='_StoreID', right_on='StoreID')\nca_orders = merged_df[merged_df['State'] == 'California']\naverage_order_quantity_ca = ca_orders['Order Quantity'].mean()\nprint(average_order_quantity_ca)",
    "database": "regional_sales",
    "choices": [
      "3.972972972972973",
      "4.093023255813954",
      "3.95",
      "4.012820512820513"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "How many sales orders were placed for products categorized as 'Wall Frames'?",
    "code": "wall_frames_product_id = Products[Products['Product Name'] == 'Wall Frames']['ProductID'].values[0]\nwall_frames_orders_count = Sales_Orders[Sales_Orders['_ProductID'] == wall_frames_product_id].shape[0]\nprint(wall_frames_orders_count)",
    "database": "regional_sales",
    "choices": [
      "2",
      "3",
      "16",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 0
    }
  },
  {
    "question": "What is the average household income of the locations where Ascend Ltd has made purchases?",
    "code": "merged_data = Sales_Orders.merge(Customers, left_on='_CustomerID', right_on='CustomerID')\nascend_orders = merged_data[merged_data['Customer Names'] == 'Ascend Ltd']\nstore_locations = ascend_orders.merge(Store_Locations, left_on='_StoreID', right_on='StoreID')\naverage_income = store_locations['Household Income'].mean()\nprint(average_income)",
    "database": "regional_sales",
    "choices": [
      "92183.6",
      "80539.5",
      "44144.666666666664",
      "72656.0"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the name of the sales team that handled the most orders?",
    "code": "# Merging Sales_Orders with Sales_Team on `_SalesTeamID`\nsales_with_team = Sales_Orders.merge(Sales_Team, left_on='_SalesTeamID', right_on='SalesTeamID')\n\n# Counting orders handled by each sales team and finding the team with the most orders\nsales_team_most_orders = sales_with_team['Sales Team'].value_counts().idxmax()\n\nprint(sales_team_most_orders)",
    "database": "regional_sales",
    "choices": [
      "George Lewis",
      "Samuel Fowler",
      "Anthony Berry",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average discount applied on orders from the 'Midwest' region?",
    "code": "# Merging Sales_Orders with Sales_Team and Regions to get the region information\nsales_with_region = Sales_Orders.merge(Sales_Team[['SalesTeamID', 'Region']], left_on='_SalesTeamID', right_on='SalesTeamID')\nsales_midwest = sales_with_region[sales_with_region['Region'] == 'Midwest']\n\n# Calculating the average discount for orders in the 'Midwest' region\naverage_discount_midwest = sales_midwest['Discount Applied'].mean()\n\nprint(average_discount_midwest)",
    "database": "regional_sales",
    "choices": [
      "0.1265625",
      "0.10969387755102043",
      "0.09886363636363638",
      "0.12280219780219782"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "Which customer has placed the largest order by quantity?",
    "code": "# Merging Sales_Orders with Customers on `_CustomerID`\nsales_with_customer = Sales_Orders.merge(Customers, left_on='_CustomerID', right_on='CustomerID')\n\n# Finding the order with the largest order quantity\nlargest_order_customer = sales_with_customer.loc[sales_with_customer['Order Quantity'].idxmax(), 'Customer Names']\n\nprint(largest_order_customer)",
    "database": "regional_sales",
    "choices": [
      "OHTA'S Corp",
      "E. Ltd",
      "Prasco Group",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 0
    }
  },
  {
    "question": "How many cities have restaurants with a review score of 2?",
    "code": "review_score_2_cities = generalinfo[generalinfo['review'] == 2]['city'].nunique()\nprint(review_score_2_cities)",
    "database": "restaurant",
    "choices": [
      "96",
      "59",
      "76",
      "109"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many restaurants are located in the Bay Area?",
    "code": "bay_area_cities = geographic[geographic['region'] == 'bay area']['city']\nbay_area_restaurants = generalinfo[generalinfo['city'].isin(bay_area_cities)]\nnum_bay_area_restaurants = bay_area_restaurants['id_restaurant'].nunique()\nprint(num_bay_area_restaurants)",
    "database": "restaurant",
    "choices": [
      "494",
      "1983",
      "988",
      "243"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "Which city has the highest average review score for its restaurants?",
    "code": "# Group by city and calculate the average review score\naverage_reviews_by_city = generalinfo.groupby('city')['review'].mean()\n# Find the city with the highest average review score\ncity_with_highest_avg_review = average_reviews_by_city.idxmax()\nprint(city_with_highest_avg_review)",
    "database": "restaurant",
    "choices": [
      "kenwood",
      "monterey",
      "cotati",
      "rohnert park"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many unique counties are represented in the dataset?",
    "code": "# Get unique counties from the geographic table\nunique_counties = geographic['county'].nunique()\nprint(unique_counties)",
    "database": "restaurant",
    "choices": [
      "13",
      "16",
      "17",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of unique food types offered by restaurants across all cities?",
    "code": "unique_food_types = generalinfo['food_type'].nunique()\nprint(unique_food_types)",
    "database": "restaurant",
    "choices": [
      "71",
      "43",
      "92",
      "58"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "How many complaints were received for clients living in the 'Northeast' region?",
    "code": "northeast_states = state[state['Region'] == 'Northeast']['StateCode']\nnortheast_clients = client[client['state'].isin(northeast_states)]['client_id']\nnortheast_complaints = events[events['Client_ID'].isin(northeast_clients)]\nnortheast_complaints_count = len(northeast_complaints)\nprint(northeast_complaints_count)",
    "database": "retail_complains",
    "choices": [
      "12",
      "27",
      "10",
      "5"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "How many unique products have been reviewed in the 'Midwest' region?",
    "code": "midwest_states = state[state['Region'] == 'Midwest']['StateCode']\nmidwest_districts = district[district['state_abbrev'].isin(midwest_states)]['district_id']\nmidwest_reviews = reviews[reviews['district_id'].isin(midwest_districts)]\nunique_products_count = midwest_reviews['Product'].nunique()\nprint(unique_products_count)",
    "database": "retail_complains",
    "choices": [
      "3",
      "2",
      "1",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the total number of complaints from clients in the region where the district ID 25 is located?",
    "code": "# Find the state abbreviation for district_id 25\nstate_abbrev = district[district['district_id'] == 25]['state_abbrev'].values[0]\n\n# Find the region for this state\nregion = state[state['StateCode'] == state_abbrev]['Region'].values[0]\n\n# Find all clients in the region\nclients_in_region = client[client['state'].isin(state[state['Region'] == region]['StateCode'])]['client_id']\n\n# Count the number of complaints from these clients\ncomplaints_count = events[events['Client_ID'].isin(clients_in_region)].shape[0]\n\nprint(complaints_count)",
    "database": "retail_complains",
    "choices": [
      "9",
      "1",
      "2",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "How many unique clients have had events related to a credit card issue?",
    "code": "unique_clients_with_credit_card_issue = events[events['Product'] == 'Credit card']['Client_ID'].nunique()\nprint(unique_clients_with_credit_card_issue)",
    "database": "retail_complains",
    "choices": [
      "98",
      "25",
      "74",
      "209"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the average number of complaints per state?",
    "code": "average_complaints_per_state = events.merge(client, left_on='Client_ID', right_on='client_id').groupby('state').size().mean()\nprint(average_complaints_per_state)",
    "database": "retail_complains",
    "choices": [
      "1.6666666666666667",
      "1.5454545454545454",
      "2.0434782608695654",
      "1.5"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many complaints were resolved timely with explanation from clients in the \"Northeast\" region?",
    "code": "# Merging client, district, and state tables\ndistrict_state = district.merge(state, left_on=\"state_abbrev\", right_on=\"StateCode\")\nclient_district_state = client.merge(district_state, left_on=\"district_id\", right_on=\"district_id\")\n\n# Merging with events table\nclient_events = client_district_state.merge(events, left_on=\"client_id\", right_on=\"Client_ID\")\n\n# Filtering for timely responses with explanation and region Northeast\nresolved_timely = client_events[(client_events[\"Timely response?\"] == \"Yes\") &\n                                (client_events[\"Company response to consumer\"] == \"Closed with explanation\") &\n                                (client_events[\"Region\"] == \"Northeast\")]\n\nresolved_count = resolved_timely.shape[0]\n\nprint(resolved_count)",
    "database": "retail_complains",
    "choices": [
      "19",
      "4",
      "8",
      "9"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many unique clients have both filed complaints and had interactions recorded in the call center logs?",
    "code": "# Clients with complaints\nclients_with_complaints = events[\"Client_ID\"].unique()\n\n# Clients with calls\nclients_with_calls = callcenterlogs[\"rand client\"].unique()\n\n# Finding intersection of clients with complaints and calls\nunique_clients = len(set(clients_with_complaints) & set(clients_with_calls))\n\nprint(unique_clients)",
    "database": "retail_complains",
    "choices": [
      "19",
      "6",
      "16",
      "43"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the average age of clients who made calls to the call center?",
    "code": "# Find clients who made calls to the call center using 'rand client' in 'callcenterlogs'\nclients_with_calls = callcenterlogs['rand client'].unique()\n# Get ages of clients who made calls\nages_of_calling_clients = client[client['client_id'].isin(clients_with_calls)]['age'].mean()\nprint(ages_of_calling_clients)",
    "database": "retail_complains",
    "choices": [
      "51.46511627906977",
      "53.0",
      "47.666666666666664",
      "55.36842105263158"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the total number of unique suppliers that provide parts in the 'EUROPE' region?",
    "code": "merged = partsupp.merge(supplier, left_on='ps_suppkey', right_on='s_suppkey')\nmerged = merged.merge(nation, left_on='s_nationkey', right_on='n_nationkey')\nmerged = merged.merge(region, left_on='n_regionkey', right_on='r_regionkey')\nunique_suppliers_europe = merged[merged['r_name'] == 'EUROPE']['ps_suppkey'].nunique()\nprint(unique_suppliers_europe)",
    "database": "retails",
    "choices": [
      "7",
      "17",
      "51",
      "104"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the average retail price of parts that were ordered by customers in the 'HOUSEHOLD' market segment?",
    "code": "merged = orders.merge(customer, left_on='o_custkey', right_on='c_custkey')\nhousehold_orders = merged[merged['c_mktsegment'] == 'HOUSEHOLD']['o_orderkey']\nlineitem_ordered = lineitem[lineitem['l_orderkey'].isin(household_orders)]\nmerged_lineitem_part = lineitem_ordered.merge(part, left_on='l_partkey', right_on='p_partkey')\naverage_retail_price = merged_lineitem_part['p_retailprice'].mean()\nprint(average_retail_price)",
    "database": "retails",
    "choices": [
      "1476.589",
      "1340.2591666666665",
      "1502.3400000000001",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "How many different parts have been shipped with 'FOB' shipping mode?",
    "code": "fob_shipments = lineitem[lineitem['l_shipmode'] == 'FOB']\nunique_parts_shipped_FOB = fob_shipments['l_partkey'].nunique()\nprint(unique_parts_shipped_FOB)",
    "database": "retails",
    "choices": [
      "5",
      "17",
      "9",
      "10"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the total account balance of customers in 'ASIA'?",
    "code": "merged = customer.merge(nation, left_on='c_nationkey', right_on='n_nationkey')\nmerged = merged.merge(region, left_on='n_regionkey', right_on='r_regionkey')\ntotal_acctbal_asia = merged[merged['r_name'] == 'ASIA']['c_acctbal'].sum()\nprint(total_acctbal_asia)",
    "database": "retails",
    "choices": [
      "20378.96",
      "10638.43",
      "57065.7",
      "107740.35"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the maximum discount given on any 'FURNITURE' segment order?",
    "code": "merged = orders.merge(customer, left_on='o_custkey', right_on='c_custkey')\nfurniture_orders = merged[merged['c_mktsegment'] == 'FURNITURE']['o_orderkey']\nlineitem_furniture_orders = lineitem[lineitem['l_orderkey'].isin(furniture_orders)]\nmax_discount_furniture = lineitem_furniture_orders['l_discount'].max()\nprint(max_discount_furniture)",
    "database": "retails",
    "choices": [
      "0.09",
      "0.1",
      "0.08",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "What is the total number of unique parts that have been ordered across all orders?",
    "code": "unique_parts = len(lineitem['l_partkey'].unique())\nprint(unique_parts)",
    "database": "retails",
    "choices": [
      "128",
      "32",
      "19",
      "64"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "Which nation has the highest average customer account balance?",
    "code": "merged_df = customer.merge(nation, left_on='c_nationkey', right_on='n_nationkey')\naverage_acctbal_per_nation = merged_df.groupby('n_name')['c_acctbal'].mean()\nnation_with_highest_acctbal = average_acctbal_per_nation.idxmax()\nprint(nation_with_highest_acctbal)",
    "database": "retails",
    "choices": [
      "BRAZIL",
      "ARGENTINA",
      "UNITED KINGDOM",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "What is the total retail price of parts supplied by suppliers in the EUROPE region?",
    "code": "import pandas as pd\n\n# Merge nation and region tables to get region names\nnation_region = pd.merge(nation, region, left_on='n_regionkey', right_on='r_regionkey')\n# Filter for the EUROPE region\neurope_nations = nation_region[nation_region['r_name'] == 'EUROPE']\n\n# Merge supplier and filtered nations to get suppliers in EUROPE\neurope_suppliers = pd.merge(supplier, europe_nations, left_on='s_nationkey', right_on='n_nationkey')\n\n# Merge part, partsupp, and euro suppliers to get relevant parts\npart_supplier = pd.merge(pd.merge(part, partsupp, left_on='p_partkey', right_on='ps_partkey'), europe_suppliers, left_on='ps_suppkey', right_on='s_suppkey')\n\n# Calculate the total retail price of parts\ntotal_retail_price_europe = part_supplier['p_retailprice'].sum()\nprint(total_retail_price_europe)",
    "database": "retails",
    "choices": [
      "10513.48",
      "24480.739999999998",
      "157029.45999999996",
      "72636.4"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the average supply cost for parts supplied by suppliers in the EUROPE region?",
    "code": "europe_suppliers = supplier[supplier['s_nationkey'].isin(nation[nation['n_regionkey'] == region[region['r_name'] == 'EUROPE'].iat[0,0]]['n_nationkey'])]\neurope_partsupp = partsupp[partsupp['ps_suppkey'].isin(europe_suppliers['s_suppkey'])]\naverage_supply_cost = europe_partsupp['ps_supplycost'].mean()\nprint(average_supply_cost)",
    "database": "retails",
    "choices": [
      "525.8394117647058",
      "507.8846296296296",
      "579.7014285714287",
      "531.7503921568626"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the total revenue generated from all sales?",
    "code": "# Merge the Sales DataFrame with the Products DataFrame to access price information for each product sold\nmerged_sales_products = Sales.merge(Products, on='ProductID')\n\n# Calculate revenue for each sale\nmerged_sales_products['Revenue'] = merged_sales_products['Price'] * merged_sales_products['Quantity']\n\n# Calculate the total revenue\ntotal_revenue = merged_sales_products['Revenue'].sum()\n\nprint(total_revenue)",
    "database": "sales",
    "choices": [
      "1689140227.53",
      "557068567.4250001",
      "274185752.595",
      "136542590.145"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "Which customer purchased the most expensive product?",
    "code": "# Merge the Sales DataFrame with the Products DataFrame to access price information for each product sold\nmerged_sales_products = Sales.merge(Products, on='ProductID')\n\n# Identify the most expensive product sold in any single transaction\nmost_expensive_sale = merged_sales_products.loc[merged_sales_products['Price'].idxmax()]\n\n# Find the CustomerID for this sale\ncustomer_id = most_expensive_sale['CustomerID']\n\n# Find the first name of the customer\ncustomer_name = Customers.loc[Customers['CustomerID'] == customer_id, 'FirstName'].values[0]\n\nprint(customer_name)",
    "database": "sales",
    "choices": [
      "Dalton",
      "Nathaniel",
      "Andre",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the name of the product that has been sold the most times?",
    "code": "# Group the Sales data by ProductID and count the number of times each product has been sold\nproduct_sales_count = Sales['ProductID'].value_counts()\n\n# Get the ProductID of the most frequently sold product\nmost_frequent_product_id = product_sales_count.idxmax()\n\n# Find the name of that product\nmost_frequent_product_name = Products.loc[Products['ProductID'] == most_frequent_product_id, 'Name'].values[0]\n\nprint(most_frequent_product_name)",
    "database": "sales",
    "choices": [
      "Hex Nut 16",
      "LL Road Seat/Saddle",
      "Lock Nut 22",
      "Internal Lock Washer 9"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "Which product generated the most revenue?",
    "code": "# Merge the Sales DataFrame with the Products DataFrame to access price information for each product sold\nmerged_sales_products = Sales.merge(Products, on='ProductID')\n\n# Calculate revenue for each sale\nmerged_sales_products['Revenue'] = merged_sales_products['Price'] * merged_sales_products['Quantity']\n\n# Group by product and sum the revenue\nproduct_revenue = merged_sales_products.groupby('ProductID')['Revenue'].sum()\n\n# Get the ProductID of the product with the maximum revenue\ntop_product_id = product_revenue.idxmax()\n\n# Find the name of the product\ntop_product_name = Products.loc[Products['ProductID'] == top_product_id, 'Name'].values[0]\n\nprint(top_product_name)",
    "database": "sales",
    "choices": [
      "Mountain-100 Silver, 38",
      "Road-250 Red, 48",
      "Mountain-100 Silver, 44",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "Which customer has purchased the highest quantity of products?",
    "code": "# Group Sales by CustomerID and sum the Quantity\ntotal_quantity_per_customer = Sales.groupby('CustomerID')['Quantity'].sum()\n\n# Identify the CustomerID with the highest quantity\nmax_quantity_customer_id = total_quantity_per_customer.idxmax()\n\n# Find the Customer name with the max CustomerID\nmax_quantity_customer_name = Customers[Customers['CustomerID'] == max_quantity_customer_id]\n\n# Get the first name and last name\ncustomer_full_name = f\"{max_quantity_customer_name['FirstName'].values[0]} {max_quantity_customer_name['LastName'].values[0]}\"\n\nprint(customer_full_name)",
    "database": "sales",
    "choices": [
      "Kristi Saunders",
      "Seth Perry",
      "Sheena Lal",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "What is the average purchase quantity per customer?",
    "code": "# Group Sales by CustomerID, calculate mean of quantity\naverage_quantity_per_customer = Sales.groupby('CustomerID')['Quantity'].mean()\n\n# Calculate the overall average\noverall_average_quantity = average_quantity_per_customer.mean()\n\nprint(overall_average_quantity)",
    "database": "sales",
    "choices": [
      "476.4536750116713",
      "488.05143371324255",
      "501.189158923379",
      "469.76018577834725"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "How many unique products have been sold based on the sales data?",
    "code": "# Use sales data to find out the number of unique products sold\nunique_products_sold = Sales['ProductID'].nunique()\n\nprint(unique_products_sold)",
    "database": "sales",
    "choices": [
      "392",
      "474",
      "258",
      "320"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "Which customer made the most purchases in terms of quantity?",
    "code": "# Use sales data to determine the customer with the highest total quantity of purchases\ncustomer_purchases = Sales.groupby('CustomerID')['Quantity'].sum()\ntop_customer_id = customer_purchases.idxmax()\n\ntop_customer_name = Customers.loc[Customers['CustomerID'] == top_customer_id, 'FirstName'].values[0]\n\nprint(top_customer_name)",
    "database": "sales",
    "choices": [
      "Kristi",
      "Seth",
      "Sheena",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "Which customer made the largest purchase in terms of total price of products bought?",
    "code": "# Merge Sales and Products to get the price information on each sale\nsales_products = Sales.merge(Products, on=\"ProductID\")\n\n# Calculate total price per sale\nsales_products[\"TotalPrice\"] = sales_products[\"Quantity\"] * sales_products[\"Price\"]\n\n# Find the Sale with the largest total price\nlargest_purchase = sales_products.groupby(\"CustomerID\")[\"TotalPrice\"].sum().idxmax()\n\n# Get the customer name\ncustomer_name = Customers.loc[Customers[\"CustomerID\"] == largest_purchase, [\"FirstName\", \"LastName\"]].iloc[0]\ncustomer_full_name = f\"{customer_name['FirstName']} {customer_name['LastName']}\"\nprint(customer_full_name)",
    "database": "sales",
    "choices": [
      "Darryl Hu",
      "Samantha Taylor",
      "Dalton Coleman",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the total revenue generated by each sales person?",
    "code": "# Merge Sales and Products to get the price information on each sale\nsales_products = Sales.merge(Products, on=\"ProductID\")\n\n# Calculate the total revenue per salesperson\nsales_products[\"TotalPrice\"] = sales_products[\"Quantity\"] * sales_products[\"Price\"]\ntotal_revenue_per_salesperson = sales_products.groupby(\"SalesPersonID\")[\"TotalPrice\"].sum()\n\n# Convert to string for printing\nrevenue_str = \"; \".join([f\"SalesPersonID {idx}: ${total_revenue:.2f}\" for idx, total_revenue in total_revenue_per_salesperson.items()])\nprint(revenue_str)",
    "database": "sales",
    "choices": [
      "SalesPersonID 1: $7197590.64; SalesPersonID 2: $8011428.46; SalesPersonID 3: $3802206.65; SalesPersonID 4: $1886933.80; SalesPersonID 5: $4633602.13; SalesPersonID 6: $5059922.28; SalesPersonID 7: $7398318.35; SalesPersonID 8: $13435417.92; SalesPersonID 9: $8326637.75; SalesPersonID 10: $3802958.31; SalesPersonID 11: $3685801.14; SalesPersonID 12: $9468963.61; SalesPersonID 13: $9214553.47; SalesPersonID 14: $6106278.92; SalesPersonID 15: $1628691.41; SalesPersonID 16: $5543106.08; SalesPersonID 17: $3988156.42; SalesPersonID 18: $3731169.92; SalesPersonID 19: $9731107.49; SalesPersonID 20: $7069697.20; SalesPersonID 21: $4218314.07; SalesPersonID 22: $4633041.84; SalesPersonID 23: $3968692.29",
      "SalesPersonID 1: $39501949.58; SalesPersonID 2: $21694693.56; SalesPersonID 3: $10563680.81; SalesPersonID 4: $13519799.06; SalesPersonID 5: $21170788.93; SalesPersonID 6: $27040409.42; SalesPersonID 7: $32320217.67; SalesPersonID 8: $48605597.24; SalesPersonID 9: $16466648.61; SalesPersonID 10: $20276017.44; SalesPersonID 11: $17427830.32; SalesPersonID 12: $29434830.25; SalesPersonID 13: $27438438.03; SalesPersonID 14: $30923890.73; SalesPersonID 15: $24973617.35; SalesPersonID 16: $20671025.20; SalesPersonID 17: $19021823.04; SalesPersonID 18: $19523996.84; SalesPersonID 19: $32565865.41; SalesPersonID 20: $22913973.70; SalesPersonID 21: $26420099.57; SalesPersonID 22: $22228907.79; SalesPersonID 23: $12364466.88",
      "SalesPersonID 1: $112673279.30; SalesPersonID 2: $94645001.43; SalesPersonID 3: $29303862.44; SalesPersonID 4: $54406853.45; SalesPersonID 5: $49754911.01; SalesPersonID 6: $83097547.89; SalesPersonID 7: $99312877.98; SalesPersonID 8: $125358028.37; SalesPersonID 9: $68828405.72; SalesPersonID 10: $63348490.73; SalesPersonID 11: $58263840.62; SalesPersonID 12: $67341131.09; SalesPersonID 13: $89518686.28; SalesPersonID 14: $111141747.36; SalesPersonID 15: $98403114.34; SalesPersonID 16: $60076464.10; SalesPersonID 17: $63944847.15; SalesPersonID 18: $65085288.47; SalesPersonID 19: $81996512.84; SalesPersonID 20: $61770367.04; SalesPersonID 21: $68267622.47; SalesPersonID 22: $54162587.43; SalesPersonID 23: $28438760.01",
      "SalesPersonID 1: $12425347.49; SalesPersonID 2: $10437309.32; SalesPersonID 3: $5063362.29; SalesPersonID 4: $6414700.31; SalesPersonID 5: $9051924.59; SalesPersonID 6: $11584053.54; SalesPersonID 7: $16293199.02; SalesPersonID 8: $28342161.56; SalesPersonID 9: $10256232.68; SalesPersonID 10: $11175011.83; SalesPersonID 11: $6923338.79; SalesPersonID 12: $14609446.36; SalesPersonID 13: $14699758.66; SalesPersonID 14: $13805050.34; SalesPersonID 15: $11498029.88; SalesPersonID 16: $11264161.64; SalesPersonID 17: $7820064.04; SalesPersonID 18: $10047420.45; SalesPersonID 19: $13488369.02; SalesPersonID 20: $14277138.10; SalesPersonID 21: $15530640.82; SalesPersonID 22: $12374225.76; SalesPersonID 23: $6804806.10"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "Which customer has purchased the most different products?",
    "code": "# Calculate the number of unique products purchased by each customer\nunique_products_per_customer = Sales.groupby(\"CustomerID\")[\"ProductID\"].nunique()\n\n# Determine the customer with the most unique products purchased\nmost_diverse_customer_id = unique_products_per_customer.idxmax()\n\n# Get the customer's name\ncustomer_name = Customers.loc[Customers[\"CustomerID\"] == most_diverse_customer_id, [\"FirstName\", \"LastName\"]].iloc[0]\ncustomer_full_name = f\"{customer_name['FirstName']} {customer_name['LastName']}\"\nprint(customer_full_name)",
    "database": "sales",
    "choices": [
      "Alvin She",
      "Andre Arun",
      "Katherine King",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "Which work has the earliest scene, considering both the chapters and works tables?",
    "code": "earliest_scene_work_id = chapters.loc[chapters['Act'].idxmin()]['work_id']\nearliest_scene_work_title = works.loc[works['id'] == earliest_scene_work_id, 'Title'].values[0]\nprint(earliest_scene_work_title)",
    "database": "shakespeare",
    "choices": [
      "Taming of the Shrew",
      "As You Like It",
      "Twelfth Night",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the total number of paragraphs spoken by characters in comedies, using the works, chapters, and paragraphs tables?",
    "code": "works_comedy = works[works['GenreType'] == 'Comedy']\ncomedy_chapters = chapters[chapters['work_id'].isin(works_comedy['id'])]\ncomedy_paragraphs_count = paragraphs[paragraphs['chapter_id'].isin(comedy_chapters['id'])].shape[0]\nprint(comedy_paragraphs_count)",
    "database": "shakespeare",
    "choices": [
      "539",
      "81",
      "167",
      "44"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many chapters are associated with the comedy genre in the works table?",
    "code": "comedy_work_ids = works[works['GenreType'] == 'Comedy']['id']\ncomedy_chapters_count = chapters[chapters['work_id'].isin(comedy_work_ids)].shape[0]\nprint(comedy_chapters_count)",
    "database": "shakespeare",
    "choices": [
      "99",
      "36",
      "60",
      "170"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many unique characters appear in works with a title starting with 'T'?",
    "code": "work_ids_with_t = works[works['Title'].str.startswith('T')]['id']\nchapter_ids_with_t = chapters[chapters['work_id'].isin(work_ids_with_t)]['id']\nunique_characters_t = paragraphs[paragraphs['chapter_id'].isin(chapter_ids_with_t)]['character_id'].nunique()\nprint(unique_characters_t)",
    "database": "shakespeare",
    "choices": [
      "104",
      "28",
      "58",
      "18"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many paragraphs contain stage directions in works authored before 1600?",
    "code": "works_before_1600 = works[works['Date'] < 1600]['id']\nchapters_before_1600 = chapters[chapters['work_id'].isin(works_before_1600)]['id']\nstage_directions_id = characters[characters['CharName'] == '(stage directions)']['id'].values[0]\nstage_direction_paragraphs_count = paragraphs[\n    (paragraphs['chapter_id'].isin(chapters_before_1600)) & \n    (paragraphs['character_id'] == stage_directions_id)].shape[0]\nprint(stage_direction_paragraphs_count)",
    "database": "shakespeare",
    "choices": [
      "99",
      "38",
      "25",
      "10"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "What is the name of the work with the most chapters featuring stage directions?",
    "code": "stage_directions_id = characters[characters['CharName'] == '(stage directions)']['id'].values[0]\n\nchapters_with_stage_directions = paragraphs[paragraphs['character_id'] == stage_directions_id]['chapter_id'].unique()\nworks_with_stage_directions = chapters[chapters['id'].isin(chapters_with_stage_directions)]['work_id']\n\nmost_stage_directions_work_id = works_with_stage_directions.value_counts().idxmax()\nmost_stage_directions_work_title = works[works['id'] == most_stage_directions_work_id]['Title'].values[0]\n\nprint(most_stage_directions_work_title)",
    "database": "shakespeare",
    "choices": [
      "Antony and Cleopatra",
      "Henry VI, Part II",
      "Richard III",
      "Twelfth Night"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many different characters are there in the work \"Cymbeline\"?",
    "code": "cymbeline_work_id = works.loc[works['Title'] == \"Cymbeline\", 'id'].iloc[0]\ncymbeline_chapters = chapters.loc[chapters['work_id'] == cymbeline_work_id, 'id']\ncymbeline_paragraphs = paragraphs[paragraphs['chapter_id'].isin(cymbeline_chapters)]\ndistinct_characters = cymbeline_paragraphs['character_id'].nunique()\nprint(distinct_characters)",
    "database": "shakespeare",
    "choices": [
      "4",
      "10",
      "5",
      "15"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "Which work features the most acts?",
    "code": "work_act_counts = chapters.groupby('work_id')['Act'].nunique()\nmost_acts_work_id = work_act_counts.idxmax()\nmost_acts_work_title = works.loc[works['id'] == most_acts_work_id, 'Title'].iloc[0]\nprint(most_acts_work_title)",
    "database": "shakespeare",
    "choices": [
      "Hamlet",
      "All's Well That Ends Well",
      "Taming of the Shrew",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the average Klout score for tweets from users in California?",
    "code": "# Merge twitter and location on LocationID\ntweet_location = twitter.merge(location, on='LocationID')\n# Filter for California\ncalifornia_tweets = tweet_location[tweet_location['StateCode'] == 'US-CA']\n# Calculate the average Klout score\naverage_klout = california_tweets['Klout'].mean()\n\n# Print average Klout score\nprint(average_klout)",
    "database": "social_media",
    "choices": [
      "43.90625",
      "43.205882352941174",
      "43.0",
      "42.93333333333333"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "What is the average reach of tweets from users located in Germany?",
    "code": "germany_location_ids = location[location['Country'] == 'Germany']['LocationID']\ngermany_users = twitter[twitter['LocationID'].isin(germany_location_ids)]\naverage_reach_germany = germany_users['Reach'].mean()\nprint(average_reach_germany)",
    "database": "social_media",
    "choices": [
      "564.5",
      "843.1428571428571",
      "1011.7333333333333",
      "1409.357142857143"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "How many tweets do male users have from the United States?",
    "code": "us_location_ids = location[location['Country'] == 'United States']['LocationID']\nmale_users = user[user['Gender'] == 'Male']['UserID']\nus_male_tweets = twitter[(twitter['LocationID'].isin(us_location_ids)) & (twitter['UserID'].isin(male_users))]\nmale_user_tweet_count_us = len(us_male_tweets)\nprint(male_user_tweet_count_us)",
    "database": "social_media",
    "choices": [
      "12",
      "87",
      "185",
      "35"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of tweets from female users in the United Kingdom?",
    "code": "uk_location_ids = location[location['Country'] == 'United Kingdom']['LocationID']\nfemale_users = user[user['Gender'] == 'Female']['UserID']\nuk_female_tweets = twitter[(twitter['LocationID'].isin(uk_location_ids)) & (twitter['UserID'].isin(female_users))]\nfemale_tweet_count_uk = len(uk_female_tweets)\nprint(female_tweet_count_uk)",
    "database": "social_media",
    "choices": [
      "5",
      "6",
      "9",
      "4"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the total number of unique users who tweeted in California?",
    "code": "california_ids = location[location['State'] == 'California']['LocationID']\nunique_users_in_california = twitter[twitter['LocationID'].isin(california_ids)]['UserID'].nunique()\nprint(unique_users_in_california)",
    "database": "social_media",
    "choices": [
      "14",
      "33",
      "86",
      "45"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "How many retweets originated from France?",
    "code": "france_id = location[location['Country'] == 'France']['LocationID'].iloc[0]\nretweets_from_france = twitter[(twitter['LocationID'] == france_id) & (twitter['IsReshare'] == 'TRUE')]['TweetID'].count()\nprint(retweets_from_france)",
    "database": "social_media",
    "choices": [
      "4",
      "8",
      "15",
      "2"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the total reach of tweets in the English language from Canada?",
    "code": "canada_ids = location[location['Country'] == 'Canada']['LocationID']\nenglish_tweets_from_canada = twitter[(twitter['LocationID'].isin(canada_ids)) & (twitter['Lang'] == 'en')]\ntotal_reach_english_canada = english_tweets_from_canada['Reach'].sum()\nprint(total_reach_english_canada)",
    "database": "social_media",
    "choices": [
      "7992",
      "4022",
      "4468",
      "6196"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "Which state had the lowest average Twitter sentiment score for tweets made in English?",
    "code": "merged_data = twitter.merge(location, on='LocationID')\nenglish_tweets = merged_data[merged_data['Lang'] == 'en']\navg_sentiment_by_state = english_tweets.groupby('State')['Sentiment'].mean()\nlowest_sentiment_state = avg_sentiment_by_state.idxmin()\nprint(lowest_sentiment_state)",
    "database": "social_media",
    "choices": [
      "Provence-Alpes-Cote d'Azur",
      "Tennessee",
      "Groningen",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "Which city has the highest average sentiment score for tweets?",
    "code": "merged_data = twitter.merge(location, on='LocationID')\ncity_sentiment_stats = merged_data.groupby('City')['Sentiment'].mean()\ncity_with_highest_avg_sentiment = city_sentiment_stats.idxmax()\nprint(city_with_highest_avg_sentiment)",
    "database": "social_media",
    "choices": [
      "Crowthorne",
      "Boston",
      "Alburtis",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "How many unique users have tweeted from Greater London?",
    "code": "greater_london_location_id = location[location['City'] == 'London']['LocationID'].values[0]\nunique_users_greater_london = twitter[twitter['LocationID'] == greater_london_location_id]['UserID'].nunique()\nprint(unique_users_greater_london)",
    "database": "social_media",
    "choices": [
      "39",
      "16",
      "6",
      "9"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many unique GEOIDs are associated with customers who have not responded to any mailings?",
    "code": "all_responded_refids = Mailings1_2[Mailings1_2['RESPONSE'] == 'true']['REFID'].unique()\nnon_responded_customers = Customers[~Customers['ID'].isin(all_responded_refids)]\nunique_geoids_non_response = non_responded_customers['GEOID'].nunique()\nprint(unique_geoids_non_response)",
    "database": "software_company",
    "choices": [
      "77",
      "64",
      "32",
      "52"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many distinct occupations are represented among customers from the same geographical areas as in the Demog table?",
    "code": "geoid_set = set(Demog['GEOID'])\ndistinct_occupations = Customers[Customers['GEOID'].isin(geoid_set)]['OCCUPATION'].nunique()\nprint(distinct_occupations)",
    "database": "software_company",
    "choices": [
      "5",
      "6",
      "7",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of responses labeled as 'false' in both Mailings1_2 and mailings3?",
    "code": "total_false_responses = mailings3[mailings3['RESPONSE'] == 'false'].shape[0] + Mailings1_2[Mailings1_2['RESPONSE'] == 'false'].shape[0]\nprint(total_false_responses)",
    "database": "software_company",
    "choices": [
      "118",
      "723",
      "362",
      "1937"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average income of inhabitants who have received a mailing?",
    "code": "received_mail_ids = set(Mailings1_2['REFID']).union(set(mailings3['REFID']))\nlinked_customers = Customers[Customers['ID'].isin(received_mail_ids)]\naverage_income = Demog[Demog['GEOID'].isin(linked_customers['GEOID'])]['INCOME_K'].mean()\nprint(average_income)",
    "database": "software_company",
    "choices": [
      "2450.0653030303024",
      "2451.4325641025634",
      "2393.3424242424244",
      "2430.878269230769"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "How many unique GEOIDs from the Customers table appear in the Demog table?",
    "code": "unique_geoids_customers = set(Customers['GEOID'])\nunique_geoids_demog = set(Demog['GEOID'])\nunique_geoids_in_both = len(unique_geoids_customers.intersection(unique_geoids_demog))\nprint(unique_geoids_in_both)",
    "database": "software_company",
    "choices": [
      "52",
      "33",
      "78",
      "66"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the total number of male customers above the age of 50 who have received mailings recorded in both Mailings1_2 and mailings3?",
    "code": "combined_mailings_ids = set(Mailings1_2['REFID']).union(set(mailings3['REFID']))\nmale_customers_above_50_count = Customers[(Customers['SEX'] == 'Male') & (Customers['age'] > 50) & (Customers['ID'].isin(combined_mailings_ids))].shape[0]\nprint(male_customers_above_50_count)",
    "database": "software_company",
    "choices": [
      "100",
      "51",
      "258",
      "17"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "What is the total number of customers who responded to any mailing positively?",
    "code": "positive_responses_mailings1_2 = Mailings1_2[Mailings1_2['RESPONSE'] == 'true']\npositive_responses = positive_responses_mailings1_2['REFID'].nunique()\nprint(positive_responses)",
    "database": "software_company",
    "choices": [
      "65",
      "15",
      "7",
      "30"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the average age of customers who received mailings in Mailings1_2 and belong to regions where average household income is above 2500 in Demog?",
    "code": "# Get the GEOIDs where the average household income is above 2500.\nhigh_income_geoids = Demog[Demog['INCOME_K'] > 2500]['GEOID']\n\n# Get the IDs of customers who belong to these high-income GEOIDs.\nhigh_income_customers = Customers[Customers['GEOID'].isin(high_income_geoids)]\n\n# Get the REFIDs of these customers who received mailings in Mailings1_2.\nreceived_mailings = Mailings1_2['REFID'].isin(high_income_customers['ID'])\n\n# Calculate the average age of these customers.\naverage_age = high_income_customers[high_income_customers['ID'].isin(Mailings1_2[received_mailings]['REFID'])]['age'].mean()\nprint(average_age)",
    "database": "software_company",
    "choices": [
      "48.83830845771144",
      "54.92",
      "50.208333333333336",
      "51.95945945945946"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 3,
      "16k": 1
    }
  },
  {
    "question": "What is the average sales per order for the \"West\" region?",
    "code": "# Filter west_superstore for West region data\nwest_region_data = west_superstore[west_superstore['Region'] == 'West']\n\n# Calculate average sales per order\navg_sales_per_order = west_region_data['Sales'].mean()\n\nprint(avg_sales_per_order)",
    "database": "superstore",
    "choices": [
      "258.7466875",
      "213.9150833333333",
      "256.6158541666667",
      "199.19208333333336"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "What is the total sales amount for products in the \"Binders\" sub-category across all regions?",
    "code": "central_binders_sales = central_superstore.merge(product, on='Product ID')\neast_binders_sales = east_superstore.merge(product, on='Product ID')\nsouth_binders_sales = south_superstore.merge(product, on='Product ID')\nwest_binders_sales = west_superstore.merge(product, on='Product ID')\n\ntotal_binders_sales = (central_binders_sales.query('`Sub-Category` == \"Binders\"')['Sales'].sum() +\n                       east_binders_sales.query('`Sub-Category` == \"Binders\"')['Sales'].sum() +\n                       south_binders_sales.query('`Sub-Category` == \"Binders\"')['Sales'].sum() +\n                       west_binders_sales.query('`Sub-Category` == \"Binders\"')['Sales'].sum())\n\nprint(total_binders_sales)",
    "database": "superstore",
    "choices": [
      "22349.661999999997",
      "53880.70199999999",
      "159.15",
      "143.62199999999999"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "How many unique customers have ordered products in both the Central and West regions?",
    "code": "central_customers = set(central_superstore['Customer ID'].unique())\nwest_customers = set(west_superstore['Customer ID'].unique())\nunique_customers_central_west = len(central_customers.intersection(west_customers))\nprint(unique_customers_central_west)",
    "database": "superstore",
    "choices": [
      "48",
      "1",
      "0",
      "9"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the total sales revenue for the product \"Contract Clock, 14\", Brown\" across all regions?",
    "code": "# Filter the product table for the \"Contract Clock, 14\", Brown\" product\nproduct_filter = product[product['Product Name'] == 'Contract Clock, 14\", Brown']\n\n# Extract relevant product IDs\nproduct_ids = product_filter['Product ID'].unique()\n\n# Sum the sales from all regional superstore tables for these product IDs\ntotal_sales = 0\nfor superstore in [central_superstore, east_superstore, south_superstore, west_superstore]:\n    total_sales += superstore[superstore['Product ID'].isin(product_ids)]['Sales'].sum()\n\nprint(total_sales)",
    "database": "superstore",
    "choices": [
      "105.504",
      "8.792",
      "96.712",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the average discount provided for 'Phones' sub-category products across all regions?",
    "code": "# Filter the product table for Phones sub-category\nphones_product_ids = product[product['Sub-Category'] == 'Phones']['Product ID'].unique()\n\n# Calculate total discount and count from all superstore tables for these product IDs\ntotal_discount = 0\ntotal_count = 0\nfor superstore in [central_superstore, east_superstore, south_superstore, west_superstore]:\n    filtered_data = superstore[superstore['Product ID'].isin(phones_product_ids)]\n    total_discount += filtered_data['Discount'].sum()\n    total_count += filtered_data['Discount'].count()\n\n# Calculate average discount\naverage_discount = total_discount / total_count if total_count > 0 else 0\n\nprint(average_discount)",
    "database": "superstore",
    "choices": [
      "0.12972972972972974",
      "0.14545454545454548",
      "0.1",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 2
    }
  },
  {
    "question": "What is the total cost of claims associated with patients who have documented allergies?",
    "code": "allergy_patients = allergies['PATIENT'].unique()\nclaims_cost = claims[claims['PATIENT'].isin(allergy_patients)]['TOTAL'].sum()\nprint(claims_cost)",
    "database": "synthea",
    "choices": [
      "100",
      "500",
      "1500",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 0
    }
  },
  {
    "question": "How many patients have an allergy?",
    "code": "merged_data = allergies.merge(patients, left_on='PATIENT', right_on='patient', how='inner')\nunique_patients_with_allergies = merged_data['PATIENT'].nunique()\nprint(unique_patients_with_allergies)",
    "database": "synthea",
    "choices": [
      "26",
      "79",
      "53",
      "14"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "How many unique patients have both a claim and an encounter recorded?",
    "code": "claim_patients = claims['PATIENT'].unique()\nencounter_patients = encounters['PATIENT'].unique()\nboth_patients = set(claim_patients).intersection(encounter_patients)\nnum_unique_patients = len(both_patients)\nprint(num_unique_patients)",
    "database": "synthea",
    "choices": [
      "5",
      "1",
      "14",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 1
    }
  },
  {
    "question": "What is the average prevalence rate of all conditions listed in the all_prevalences table?",
    "code": "average_prevalence_rate = all_prevalences['PREVALENCE RATE'].mean()\nprint(average_prevalence_rate)",
    "database": "synthea",
    "choices": [
      "0.08750000000000001",
      "0.118",
      "0.06966666666666667",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "How many unique patients have a diagnosis recorded in both the Examination and Laboratory tables?",
    "code": "exam_diagnosis = set(Examination['ID'][Examination['Diagnosis'].notnull()])\nlab_ids = set(Laboratory['ID'])\nunique_patients = len(exam_diagnosis.intersection(lab_ids))\nprint(unique_patients)",
    "database": "thrombosis_prediction",
    "choices": [
      "11",
      "5",
      "1",
      "0"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "What is the most common diagnosis found in both the Examination and Patient tables?",
    "code": "common_diagnosis = Examination.merge(Patient, left_on='ID', right_on='ID', suffixes=('_exam', '_patient'))\nmost_common_diagnosis = common_diagnosis['Diagnosis_exam'].mode()[0]\nprint(most_common_diagnosis)",
    "database": "thrombosis_prediction",
    "choices": [
      "SjS",
      "Aortitis",
      "SLE",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "How many unique IDs are present across all three tables?",
    "code": "all_ids = set(Patient['ID']).union(set(Examination['ID']), set(Laboratory['ID']))\nunique_ids_count = len(all_ids)\nprint(unique_ids_count)",
    "database": "thrombosis_prediction",
    "choices": [
      "169",
      "551",
      "315",
      "975"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average age of patients at their first examination date?",
    "code": "import pandas as pd\n\n# Merge Patient and Examination on ID to get both birthday and first examination date.\nmerged_data = pd.merge(Patient, Examination, on='ID')\n\n# Convert the Birthday and Examination Date to datetime objects.\nmerged_data['Birthday'] = pd.to_datetime(merged_data['Birthday'])\nmerged_data['Examination Date'] = pd.to_datetime(merged_data['Examination Date'])\n\n# Calculate the age at first examination.\nmerged_data['Age at First Exam'] = (merged_data['Examination Date'] - merged_data['Birthday']).dt.days / 365.25\n\n# Find the average age.\naverage_age = merged_data['Age at First Exam'].mean()\nprint(average_age)",
    "database": "thrombosis_prediction",
    "choices": [
      "33.59644079397673",
      "34.07419575633128",
      "35.0839933509338",
      "36.200319415925165"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "What is the average ANA level for patients diagnosed with SLE according to both Patient and Examination tables?",
    "code": "# Get the IDs of patients diagnosed with SLE in both tables.\nsle_ids_patient = Patient[Patient['Diagnosis'] == 'SLE']['ID']\nsle_ids_examination = Examination[Examination['Diagnosis'] == 'SLE']['ID']\n\n# Combine the IDs from both sources.\nsle_ids = set(sle_ids_patient).union(set(sle_ids_examination))\n\n# Filter the Examination table for these IDs.\nsle_examinations = Examination[Examination['ID'].isin(sle_ids)]\n\n# Calculate the average ANA level.\naverage_ana = sle_examinations['ANA'].dropna().mean()\nprint(average_ana)",
    "database": "thrombosis_prediction",
    "choices": [
      "757.6666666666666",
      "650.3333333333334",
      "907.6923076923077",
      "827.4766355140187"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "How many unique diagnoses are there in the Examination table for patients who are recorded in the Patient table?",
    "code": "merged_data = Examination.merge(Patient, on='ID', how='inner')\nunique_diagnoses = merged_data['Diagnosis_x'].nunique()\nprint(unique_diagnoses)",
    "database": "thrombosis_prediction",
    "choices": [
      "37",
      "12",
      "5",
      "19"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 1,
      "16k": 2
    }
  },
  {
    "question": "How many patients have a recorded aCL IgG value and a corresponding record in the Laboratory table?",
    "code": "examination_with_acl_igg = Examination[Examination['aCL IgG'].notna()]['ID']\nunique_patients_with_lab_records = Laboratory[Laboratory['ID'].isin(examination_with_acl_igg.unique())]['ID'].nunique()\nprint(unique_patients_with_lab_records)",
    "database": "thrombosis_prediction",
    "choices": [
      "1",
      "0",
      "12",
      "5"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the total number of examinations conducted on female patients?",
    "code": "female_patient_ids = Patient[Patient['SEX'] == 'F']['ID']\ntotal_examinations_of_females = Examination[Examination['ID'].isin(female_patient_ids)].shape[0]\nprint(total_examinations_of_females)",
    "database": "thrombosis_prediction",
    "choices": [
      "60",
      "5",
      "13",
      "28"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 3,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "What is the total number of unique patients who have undergone at least one examination?",
    "code": "unique_patient_ids_examination = Examination['ID'].dropna().astype(int).unique()\nunique_patient_count = len(set(Patient['ID']).intersection(unique_patient_ids_examination))\nprint(unique_patient_count)",
    "database": "thrombosis_prediction",
    "choices": [
      "7",
      "66",
      "31",
      "15"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "How many examinations include a diagnosis of SLE?",
    "code": "sle_examinations_count = Examination[Examination['Diagnosis'].str.contains('SLE', na=False)].shape[0]\nprint(sle_examinations_count)",
    "database": "thrombosis_prediction",
    "choices": [
      "19",
      "83",
      "36",
      "181"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many unique diagnoses are recorded in the Examination table?",
    "code": "unique_diagnoses_count = Examination['Diagnosis'].str.split(', ').explode().nunique(dropna=True)\nprint(unique_diagnoses_count)",
    "database": "thrombosis_prediction",
    "choices": [
      "49",
      "26",
      "81",
      "124"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "How many atoms are part of molecules labeled with a '+'?",
    "code": "# Merge atom and molecule on molecule_id, filter for label '+', then count atoms\npositive_molecules = molecule[molecule['label'] == '+']\npositive_atoms = atom[atom['molecule_id'].isin(positive_molecules['molecule_id'])]\nnum_positive_atoms = len(positive_atoms)\nprint(num_positive_atoms)",
    "database": "toxicology",
    "choices": [
      "151",
      "574",
      "81",
      "285"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the number of unique elements found in molecules labeled with a '-'?",
    "code": "# Merge atom and molecule on molecule_id, filter for label '-', then find unique elements\nnegative_molecules = molecule[molecule['label'] == '-']\nnegative_atoms = atom[atom['molecule_id'].isin(negative_molecules['molecule_id'])]\nunique_elements = negative_atoms['element'].nunique()\nprint(unique_elements)",
    "database": "toxicology",
    "choices": [
      "7",
      "8",
      "6",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "How many bonds connect to atoms that are part of molecules labeled with a '+'?",
    "code": "# Merge connected with atom, filter for atoms in molecules labeled '+', then count bonds\npositive_atoms = atom[atom['molecule_id'].isin(molecule[molecule['label'] == '+']['molecule_id'])]\nconnected_positive_atoms = connected[connected['atom_id'].isin(positive_atoms['atom_id']) | connected['atom_id2'].isin(positive_atoms['atom_id'])]\nnum_positive_bonds = connected_positive_atoms['bond_id'].nunique()\nprint(num_positive_bonds)",
    "database": "toxicology",
    "choices": [
      "82",
      "592",
      "157",
      "292"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many positive labeled molecules contain carbon atoms?",
    "code": "positive_molecules = molecule[molecule['label'] == '+']['molecule_id']\npositive_carbon_atoms = atom[(atom['element'] == 'c') & (atom['molecule_id'].isin(positive_molecules))]\nnumber_of_positive_carbon_molecules = len(positive_carbon_atoms['molecule_id'].unique())\nprint(number_of_positive_carbon_molecules)",
    "database": "toxicology",
    "choices": [
      "125",
      "94",
      "39",
      "64"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 3,
      "16k": 2
    }
  },
  {
    "question": "What is the total number of bonds in molecules that contain nitrogen atoms?",
    "code": "nitrogen_atoms = atom[atom['element'] == 'n']['molecule_id'].unique()\nbonds_in_nitrogen_molecules = bond[bond['molecule_id'].isin(nitrogen_atoms)]\ntotal_bonds_in_nitrogen_molecules = len(bonds_in_nitrogen_molecules)\nprint(total_bonds_in_nitrogen_molecules)",
    "database": "toxicology",
    "choices": [
      "26",
      "151",
      "66",
      "505"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many molecules have both carbon and hydrogen atoms?",
    "code": "carbon_molecule_ids = atom[atom['element'] == 'c']['molecule_id'].unique()\nhydrogen_molecule_ids = atom[atom['element'] == 'h']['molecule_id'].unique()\nmolecules_with_both_c_and_h = set(carbon_molecule_ids).intersection(set(hydrogen_molecule_ids))\nnumber_of_molecules_with_both_c_and_h = len(molecules_with_both_c_and_h)\nprint(number_of_molecules_with_both_c_and_h)",
    "database": "toxicology",
    "choices": [
      "216",
      "51",
      "122",
      "17"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many unique elements are present across all molecules?",
    "code": "unique_elements = len(atom['element'].unique())\nprint(unique_elements)",
    "database": "toxicology",
    "choices": [
      "9",
      "11",
      "6",
      "10"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 3,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the total number of atoms connected through bonds in the `connected` table?",
    "code": "total_connected_atoms = len(connected['atom_id'].unique())\nprint(total_connected_atoms)",
    "database": "toxicology",
    "choices": [
      "1920",
      "506",
      "984",
      "256"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 2,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many distinct molecules contain carbon (element 'c') in their structure?",
    "code": "molecules_with_c = len(atom[atom['element'] == 'c']['molecule_id'].unique())\nprint(molecules_with_c)",
    "database": "toxicology",
    "choices": [
      "201",
      "128",
      "291",
      "370"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 2,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the total number of molecules with at least one bond of bond_type '-'?",
    "code": "molecules_with_bond = len(bond[bond['bond_type'] == '-']['molecule_id'].unique())\nprint(molecules_with_bond)",
    "database": "toxicology",
    "choices": [
      "161",
      "405",
      "340",
      "248"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "How many molecules have atoms with the element 'h'?",
    "code": "molecule_ids_with_h = atom[atom['element'] == 'h']['molecule_id'].unique()\nnum_molecules_with_h = molecule[molecule['molecule_id'].isin(molecule_ids_with_h)].shape[0]\nprint(num_molecules_with_h)",
    "database": "toxicology",
    "choices": [
      "47",
      "79",
      "187",
      "133"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of bonds associated with the molecule having the most atoms?",
    "code": "atom_count_per_molecule = atom['molecule_id'].value_counts()\nmolecule_with_most_atoms = atom_count_per_molecule.idxmax()\nnum_bonds_for_molecule = bond[bond['molecule_id'] == molecule_with_most_atoms].shape[0]\nprint(num_bonds_for_molecule)",
    "database": "toxicology",
    "choices": [
      "5",
      "11",
      "6",
      "26"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many unique element types exist in molecules labeled with '+'?",
    "code": "# Merge molecule and atom tables to filter for atoms in positively labeled molecules\npositive_atoms = atom.merge(molecule[molecule['label'] == '+'], on='molecule_id')\n# Get unique elements\nunique_elements = positive_atoms['element'].nunique()\nprint(unique_elements)",
    "database": "toxicology",
    "choices": [
      "6",
      "9",
      "10",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 2,
      "32k": 1,
      "16k": 0
    }
  },
  {
    "question": "What is the total number of bonds in molecules labeled with '-'?",
    "code": "# Merge molecule and bond tables to filter for bonds in negatively labeled molecules\nnegative_bonds = bond.merge(molecule[molecule['label'] == '-'], on='molecule_id')\n# Count the number of bonds\ntotal_negative_bonds = len(negative_bonds)\nprint(total_negative_bonds)",
    "database": "toxicology",
    "choices": [
      "226",
      "870",
      "448",
      "116"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "How many molecules contain at least one 'h' element?",
    "code": "# Filter atoms for 'h' elements and find corresponding molecules\nmolecules_with_h = atom[atom['element'] == 'h']['molecule_id'].nunique()\nprint(molecules_with_h)",
    "database": "toxicology",
    "choices": [
      "168",
      "97",
      "240",
      "53"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "What is the average student-staff ratio for universities in the United States of America in 2014?",
    "code": "# Merge the university and country tables to identify universities in the United States\nuniversities_usa = university[university['country_id'] == country[country['country_name'] == \"United States of America\"].iloc[0]['id']]\n\n# Filter the university_year table for records in 2014 related to these universities\nuniversity_ids_usa_2014 = universities_usa['id']\nuniversity_year_usa_2014 = university_year[(university_year['year'] == 2014) & (university_year['university_id'].isin(university_ids_usa_2014))]\n\n# Calculate the average student-staff ratio\naverage_ratio = university_year_usa_2014['student_staff_ratio'].mean()\nprint(average_ratio)",
    "database": "university",
    "choices": [
      "12.203896103896104",
      "11.903389830508473",
      "11.257142857142858",
      "9.92"
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "How many universities are ranked in the Total CWUR criterion in 2014?",
    "code": "# Identify the ranking criteria ID for Total CWUR in CWUR (id=3)\ncriteria_id = ranking_criteria[(ranking_criteria['criteria_name'] == \"Total CWUR\") & (ranking_criteria['ranking_system_id'] == 3)].iloc[0]['id']\n\n# Filter the university_ranking_year table for 2014 based on the criteria ID\nrankings_2014 = university_ranking_year[(university_ranking_year['year'] == 2014) & (university_ranking_year['ranking_criteria_id'] == criteria_id)]\n\n# Count the number of unique universities\nnum_universities = rankings_2014['university_id'].nunique()\nprint(num_universities)",
    "database": "university",
    "choices": [
      "41",
      "19",
      "144",
      "9"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 0,
      "32k": 1,
      "16k": 3
    }
  },
  {
    "question": "How many universities from Canada have data available for the year 2014?",
    "code": "canada_universities = university[university['country_id'] == country[country['country_name'] == 'Canada']['id'].values[0]]\ncanada_universities_2014 = canada_universities.merge(university_year, left_on=\"id\", right_on=\"university_id\")\ncount_canada_2014 = canada_universities_2014[canada_universities_2014['year'] == 2014]['university_id'].nunique()\nprint(count_canada_2014)",
    "database": "university",
    "choices": [
      "1",
      "7",
      "4",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What ratio of universities have \"international students\" as a majority (more than 50%) of their student body in the available dataset?",
    "code": "universities_majority_international = university_year[university_year['pct_international_students'] > 50]\ntotal_universities_year_data = university_year['university_id'].nunique()\nratio_majority_international = len(universities_majority_international['university_id'].unique()) / total_universities_year_data\nprint(ratio_majority_international)",
    "database": "university",
    "choices": [
      "0.0",
      "0.00816326530612245",
      "0.008333333333333333",
      "0.009345794392523364"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 2,
      "32k": 3,
      "16k": 0
    }
  },
  {
    "question": "Which country has the university with the highest score in the Income ranking criteria?",
    "code": "# Merge dataframes to obtain the necessary information\nmerged_df = (university_ranking_year\n             .merge(ranking_criteria, left_on='ranking_criteria_id', right_on='id')\n             .merge(university, left_on='university_id', right_on='id')\n             .merge(country, left_on='country_id', right_on='id'))\n\n# Filter for the 'Income' ranking criteria, then find the highest score\nincome_criteria = merged_df[merged_df['criteria_name'] == 'Income']\nmax_score_row = income_criteria.loc[income_criteria['score'].idxmax()]\n\n# Get the corresponding country name\ncountry_with_highest_score = max_score_row['country_name']\nprint(country_with_highest_score)",
    "database": "university",
    "choices": [
      "Netherlands",
      "United States of America",
      "Germany",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 0,
      "16k": 2
    }
  },
  {
    "question": "What is the average student-to-staff ratio among universities that have a ranking score in Pub criteria?",
    "code": "# Merge dataframes to obtain the necessary information\nmerged_df = (university_ranking_year\n             .merge(ranking_criteria, left_on='ranking_criteria_id', right_on='id')\n             .merge(university, left_on='university_id', right_on='id')\n             .merge(university_year, left_on='university_id', right_on='university_id'))\n\n# Filter for universities with scores in 'Pub' criteria, then calculate the average student-to-staff ratio\npub_criteria = merged_df[merged_df['criteria_name'] == 'Pub']\naverage_ratio = pub_criteria['student_staff_ratio'].mean()\nprint(average_ratio)",
    "database": "university",
    "choices": [
      "13.284920634920635",
      "9.685714285714287",
      "15.661904761904767",
      "12.74760147601476"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "How many universities are from the United Kingdom in the dataset?",
    "code": "# Merge dataframes to obtain the necessary information\nmerged_df = university.merge(country, left_on='country_id', right_on='id')\n\n# Filter for universities located in the United Kingdom and count them\nuk_universities_count = merged_df[merged_df['country_name'] == 'United Kingdom'].shape[0]\nprint(uk_universities_count)",
    "database": "university",
    "choices": [
      "15",
      "35",
      "23",
      "63"
    ],
    "rightIdx": {
      "128k": 3,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "How many universities in the United States were ranked based on the criterion 'Pub' in the year 2010?",
    "code": "# Join `university_ranking_year`, `ranking_criteria`, `university`, and `country` tables to find relevant US universities\npub_criteria_id = ranking_criteria.loc[(ranking_criteria['criteria_name'] == 'Pub') & (ranking_criteria['ranking_system_id'] == 2), 'id'].iloc[0]\nus_universities = university.loc[university['country_id'] == country.loc[country['country_name'] == 'United States of America', 'id'].iloc[0]]\nus_pub_2010 = university_ranking_year.loc[(university_ranking_year['university_id'].isin(us_universities['id'])) & (university_ranking_year['ranking_criteria_id'] == pub_criteria_id) & (university_ranking_year['year'] == 2010)]\n\n# Count the number of universities\nnumber_of_us_universities = us_pub_2010['university_id'].nunique()\nprint(number_of_us_universities)",
    "database": "university",
    "choices": [
      "1",
      "2",
      "6",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 0
    }
  },
  {
    "question": "What is the name of the university with the highest total score in all criteria for the year 2015 across all ranking systems?",
    "code": "# Filter for the year 2015\nrankings_2015 = university_ranking_year[university_ranking_year['year'] == 2015]\n\n# Sum scores by university in 2015\ntotal_scores = rankings_2015.groupby('university_id')['score'].sum()\n\n# Get the university ID with the highest total score\ntop_university_id = total_scores.idxmax()\n\n# Find the name of this university\ntop_university = university[university['id'] == top_university_id]['university_name'].values[0]\nprint(top_university)",
    "database": "university",
    "choices": [
      "Dongguk University",
      "University of Belgrade",
      "University of the Algarve",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  },
  {
    "question": "What is the average release year of games on the PC platform?",
    "code": "# Filter game_platform for the PC platform\npc_games = game_platform[game_platform['platform_id'] == platform[platform['platform_name'] == 'PC']['id'].values[0]]\n\n# Calculate the average release year for games on PC\naverage_release_year = pc_games['release_year'].mean()\n\nprint(average_release_year)",
    "database": "video_games",
    "choices": [
      "2009.2051282051282",
      "2009.2222222222222",
      "2009.6607142857142",
      "2008.8333333333333"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 3,
      "32k": 0,
      "16k": 1
    }
  },
  {
    "question": "What is the average number of sales for all games published by Tecmo Koei across all regions?",
    "code": "tecmo_koei_id = publisher.loc[publisher['publisher_name'] == 'Tecmo Koei', 'id'].values[0]\ntecmo_koei_games = game_publisher[game_publisher['publisher_id'] == tecmo_koei_id]\ntecmo_koei_game_platform_ids = game_platform[game_platform['game_publisher_id'].isin(tecmo_koei_games['id'])]\ntecmo_koei_sales = region_sales[region_sales['game_platform_id'].isin(tecmo_koei_game_platform_ids['id'])]\naverage_sales = tecmo_koei_sales['num_sales'].mean()\nprint(average_sales)",
    "database": "video_games",
    "choices": [
      "0.008181818181818182",
      "0.03529411764705882",
      "0.027567567567567567",
      "0.01"
    ],
    "rightIdx": {
      "128k": 2,
      "64k": 1,
      "32k": 0,
      "16k": 3
    }
  },
  {
    "question": "What is the total number of games released by Tecmo Koei across all platforms?",
    "code": "tecmo_games = game_publisher[game_publisher['publisher_id'] == 506]\ntecmo_game_ids = tecmo_games['game_id']\ntotal_tecmo_games = game_platform[game_platform['game_publisher_id'].isin(tecmo_games['id'])]['game_publisher_id'].nunique()\nprint(total_tecmo_games)",
    "database": "video_games",
    "choices": [
      "17",
      "36",
      "11",
      "4"
    ],
    "rightIdx": {
      "128k": 1,
      "64k": 0,
      "32k": 2,
      "16k": 3
    }
  },
  {
    "question": "Which platform has the most games published by Konami Digital Entertainment?",
    "code": "konami_publishers = game_publisher[game_publisher['publisher_id'] == 282]\nmerged = game_platform.merge(konami_publishers, left_on='game_publisher_id', right_on='id')\nplatform_counts = merged['platform_id'].value_counts()\nmost_games_platform_id = platform_counts.idxmax()\nmost_games_platform = platform.loc[platform['id'] == most_games_platform_id, 'platform_name'].iloc[0]\nprint(most_games_platform)",
    "database": "video_games",
    "choices": [
      "PS2",
      "SNES",
      "DS",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 0,
      "32k": 2,
      "16k": 1
    }
  },
  {
    "question": "Which year had the highest number of releases for the Sports genre?",
    "code": "sports_games = game[game['genre_id'] == 11]  # 11 is the id for Sports in genre\nsports_game_publishers = game_publisher[game_publisher['game_id'].isin(sports_games['id'])]\nsports_releases = game_platform[game_platform['game_publisher_id'].isin(sports_game_publishers['id'])]\nyear_counts = sports_releases['release_year'].value_counts()\nyear_with_most_releases = year_counts.idxmax()\nprint(year_with_most_releases)",
    "database": "video_games",
    "choices": [
      "2007",
      "2010",
      "2002",
      "There is no right choice above."
    ],
    "rightIdx": {
      "128k": 0,
      "64k": 1,
      "32k": 2,
      "16k": 0
    }
  }
]