library(haven)
library(dplyr)
library(tidyr)
# Read the Stata file (edit the path below so it points at your copy of the
# data) and run every haven-labelled column through as_factor()
df <- mutate(
  read_dta("entry.dta"),
  across(where(is.labelled), as_factor)
)

# Quick inspection: dimensions first, then the column names
dim(df) # recorded output: [1] 39201 551 (participants x variables)
colnames(df)
# df$b1 # life satisfaction
# df$b2_1 - b2_4 percentage
# df$b3 education
# df$b4 marital status
# df$b5 number of children
# df$b7 house income
# df$b6  the number of people in the household
# df$e1 hours per week working at the last job
# df$e8b    how the last job ended
# df$e11 weeks spent looking for work
# df$e37 credit card balance
# df$e35 savings account value
# Survey items carried forward into the modelling data set
vars_needed <- c(
  "b1", # life satisfaction
  "b2_1", "b2_2", "b2_3", "b2_4", # percentages
  "b3", # education
  "b4", # marital status
  "b5", # number of children
  "b6", # household size
  "b7", # household income
  "e8b", # how last job ended
  "e11", # weeks spent looking for work
  "e35", # savings value
  "e37" # credit card balance
)

# Restrict the data to those items; all_of() makes a missing column an error
df_sub <- select(df, all_of(vars_needed))

# Keep only respondents with no missing value in any selected item
df_complete <- drop_na(df_sub)

# Show how many rows survived and preview the first few
dim(df_complete)
head(df_complete)

# Numeric value of a column, guarding against the factor pitfall.
#
# Labelled columns are converted to factors earlier in this script, and
# as.numeric()/as.integer() on a factor return the internal level codes
# (1, 2, 3, ...) rather than the values the levels spell out. For a factor
# whose labels are all numeric, parse the labels instead. If some label is
# not numeric, warn and fall back to the level codes (the previous behaviour).
# Anything that is not a factor goes straight through as.numeric().
numeric_value <- function(x) {
  if (!is.factor(x)) {
    return(as.numeric(x))
  }
  parsed <- suppressWarnings(as.numeric(as.character(x)))
  if (anyNA(parsed[!is.na(x)])) {
    warning(
      "factor has non-numeric labels; using its level codes instead",
      call. = FALSE
    )
    return(as.numeric(x))
  }
  parsed
}

# Coerce the selected items to plain numbers. Columns not listed here
# (b2_1..b2_4 and e11) are passed through unchanged.
df_model <- df_complete %>%
  mutate(

    # (1) Life satisfaction: discrete 0–3 (3 = most satisfied)
    b1 = as.integer(b1) - 1,

    # (2) Highest education level: discrete 0–5
    b3 = as.integer(b3) - 1,

    # (3) Marital status: zero-based code, one-hot encoded further below
    b4 = as.integer(b4) - 1,

    # (4) Number of children: the count itself, not a level code
    b5 = as.integer(numeric_value(b5)),

    # (5) Household size: the count itself, not a level code
    b6 = as.integer(numeric_value(b6)),

    # (6) Annual household income (continuous)
    b7 = numeric_value(b7),

    # (7) How last job ended: zero-based code, one-hot encoded further below
    e8b = as.integer(e8b) - 1,

    # (8) Savings value
    e35 = numeric_value(e35),

    # (9) Credit card balance (continuous)
    e37 = numeric_value(e37)
  )

# One-hot encode a factor and drop the first level's column, which becomes
# the reference category (keeping it would make the columns sum to 1 and
# produce a singular design matrix).
#
# fac:    factor to encode; its levels determine the columns
# prefix: text pasted in front of each level to form the column names
#
# Returns a numeric matrix with one row per element of `fac` and one column
# per non-reference level. Stops if `fac` contains NA: that means a value was
# missing or outside the expected levels, and model.matrix() would drop such
# rows under the default na.action, leaving the matrix with fewer rows than
# the data it is later bound to.
one_hot_drop_first <- function(fac, prefix) {
  if (anyNA(fac)) {
    stop(
      "cannot one-hot encode '", prefix, "': ", sum(is.na(fac)),
      " value(s) are missing or outside the expected levels ",
      paste(levels(fac), collapse = ", "),
      call. = FALSE
    )
  }
  onehot <- model.matrix(~ fac - 1)
  colnames(onehot) <- paste0(prefix, levels(fac))
  onehot[, -1, drop = FALSE]
}

# Marital status: 6 categories (0-5); marital_status_0 is the reference
marital_factor <- factor(df_model$b4, levels = 0:5)
marital_onehot <- one_hot_drop_first(marital_factor, "marital_status_")

# How the last job ended: 3 categories (0-2); job_end_reason_0 is the reference
job_factor <- factor(df_model$e8b, levels = 0:2)
job_onehot <- one_hot_drop_first(job_factor, "job_end_reason_")

# Assemble the final modelling table: give the survey items readable names,
# drop the two raw categorical codes, and append their dummy columns
df_model <- df_model %>%
  rename(
    life_satisfaction = b1,
    education_level = b3,
    num_children = b5,
    household_size = b6,
    household_income = b7,
    savings = e35,
    credit_card_balance = e37
  ) %>%
  select(-c(b4, e8b)) %>%
  bind_cols(
    as.data.frame(marital_onehot),
    as.data.frame(job_onehot)
  )

# Save the modelling table as CSV, leaving out the row-name column
write.csv(df_model, file = "data_emotion.csv", row.names = FALSE)

