# WHI data application
# Setup: Large observational study (OS) and RCT data (CT)

library(tidyverse)
library(foreign)
library(modelr)
library(forcats)
library(ggpubr)
library(sjmisc)
library(kableExtra)

## Prepare data

setwd("~/Uni/Forschung/Article/2022 - Multi-CATE/data/WHI_archive")

ct_raw <- read.dta("WHI_CT.dta")
os_raw <- read.dta("WHI_OS.dta")

summary(ct_raw)
summary(os_raw)

# Y: chd_failure (Congenital Heart Disease)
#    time_to_chd
#    syst (systolic blood pressure at last visit)
#    syst_bl ? (diastolic blood pressure baseline)
# T: HT_intervention
# X: age agefbir angina aortican atrialfb bmix_bl booph cvd cardrest cabg cigsday diabtrt dias_bl dvt ethnic hicholrp hip55 hyptpill mi numfalls parity stroke

cov_cols = c('syst_bl', 'age', 'angina', 'aortican', 'atrialfb', 'bmix_bl', 'booph', 'cabg', 'cigsday', 'diabtrt', 'dias_bl', 'dvt', 'ethnic', 'hicholrp', 'mi', 'numfalls', 'parity', 'stroke')

ct <- ct_raw %>%
  mutate(Y = syst, # update outcome here
         T = ifelse(HT_intervention == "Estrogen + Progestin", 1, 0)) %>%
  mutate(numfalls = fct_recode(numfalls,
                               "0" = "None", 
                               "3" = "3 or more time"),
         numfalls = as.numeric(as.character(numfalls)),
         parity = fct_recode(parity,
                             "0" = "Never had term pregnancy",
                             "0" = "Never pregnant", 
                             "5" = "5+"),
         parity = as.numeric(as.character(parity))) %>%
  select(Y, T, all_of(cov_cols)) %>%
 #mutate_if(is.factor, fct_explicit_na, na_level = "Missing") %>%
  drop_na()

ct_num <- model_matrix(ct, Y ~ .) %>%
  select(-'(Intercept)') %>%
  add_column(Y = ct$Y)

names(ct_num) <- make.names(names(ct_num))

os <- os_raw %>%
  mutate(Y = syst, # update outcome here
         T = ifelse(HT_intervention == "Yes", 1, 0)) %>%
  mutate(numfalls = fct_recode(numfalls,
                               "0" = "None", 
                               "3" = "3 or more time"),
         numfalls = as.numeric(as.character(numfalls)),
         parity = fct_recode(parity,
                             "0" = "Never had term pregnancy",
                             "0" = "Never pregnant", 
                             "5" = "5+"),
         parity = as.numeric(as.character(parity))) %>%
  select(Y, T, all_of(cov_cols)) %>%
 #drop_na(dvt, mi, stroke) %>%
 #mutate_if(is.factor, fct_explicit_na, na_level = "Missing") %>%
  drop_na()

os_num <- model_matrix(os, Y ~ .) %>%
  select(-'(Intercept)') %>%
  add_column(Y = os$Y)

names(os_num) <- make.names(names(os_num))

save(ct_num, os_num,
     file = "whi_data.RData")

### Tables and Plots

os_num2 <- os %>%
  to_dummy(ethnic, cigsday) %>%
  bind_cols(os) %>%
  select(-ethnic, -cigsday)

t1 <- os_num2 %>%
  select(Y, T, syst_bl, dias_bl, bmix_bl, age, contains("cigsday"), contains("ethnic")) %>%
  summarise_all(mean)

t2 <- os_num2 %>%
  select(Y, T, syst_bl, dias_bl, bmix_bl, age, contains("cigsday"), contains("ethnic")) %>%
  group_by(T) %>%
  summarise_all(mean)

t <- t(bind_rows(t1, t2))

os_stat <- knitr::kable(t, format = 'latex', digits = 2)
writeLines(os_stat, 'os_stat.tex')

ct_num2 <- ct %>%
  to_dummy(ethnic, cigsday) %>%
  bind_cols(ct) %>%
  select(-ethnic, -cigsday)

t3 <- ct_num2 %>%
  select(Y, T, syst_bl, dias_bl, bmix_bl, age, contains("cigsday"), contains("ethnic")) %>%
  summarise_all(mean)

t4 <- ct_num2 %>%
  select(Y, T, syst_bl, dias_bl, bmix_bl, age, contains("cigsday"), contains("ethnic")) %>%
  group_by(T) %>%
  summarise_all(mean)

t <- t(bind_rows(t3, t4))

ct_stat <- knitr::kable(t, format = 'latex', digits = 2)
writeLines(ct_stat, 'ct_stat.tex')

com <- ct %>% 
  bind_rows(os, .id = "rct") %>% 
  mutate(source = ifelse(rct == 1, "RCT", "OS"),
         Treatment = factor(T),
         ethnic = fct_recode(ethnic,
                             "Am. Indian" = "American Indian",
                             "A/P Islander" = "Asian/Pacific Islander"))
  
p0 <- com %>%
  ggplot(aes(Y, fill = Treatment, group = Treatment)) +
  geom_density(alpha = 0.3) +
  labs(y = "", x = "Systolic blood pressure") +
  facet_grid(cols = vars(source))

p1 <- com %>%
  ggplot(aes(syst_bl, fill = Treatment, group = Treatment)) +
  geom_density(alpha = 0.3) +
  labs(y = "", x = "Systolic blood pressure bl.") +
  theme(legend.position = "none") +
  facet_grid(cols = vars(source))

p2 <- com %>%
  ggplot(aes(dias_bl, fill = Treatment, group = Treatment)) +
  geom_density(alpha = 0.3) +
  labs(y = "", x = "Diastolic blood pressure bl.") +
  theme(legend.position = "none") +
  facet_grid(cols = vars(source))

p3 <- com %>%
  ggplot(aes(bmix_bl, fill = Treatment, group = Treatment)) +
  geom_density(alpha = 0.3) +
  labs(y = "", x = "BMI baseline") +
  theme(legend.position = "none") +
  facet_grid(cols = vars(source))

p4 <- com %>%
  ggplot(aes(age, fill = Treatment, group = Treatment)) +
  geom_density(alpha = 0.3) +
  labs(y = "", x = "Age") +
  theme(legend.position = "none") +
  facet_grid(cols = vars(source))

p5 <- com %>%
  count(source, Treatment, ethnic) %>% 
  group_by(source, Treatment) %>% 
  mutate(Sum = sum(n)) %>% 
  mutate(proportion = n/Sum) %>% 
  ggplot(aes(y = proportion, x = ethnic, fill = Treatment)) +
  geom_col(position = "dodge2") +
  labs(y = "", x = "Ethnicity") +
  facet_grid(cols = vars(source)) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, vjust = 0.5))

p6 <- com %>%
  count(source, Treatment, cigsday) %>% 
  group_by(source, Treatment) %>% 
  mutate(Sum = sum(n)) %>% 
  mutate(proportion = n/Sum) %>% 
  ggplot(aes(y = proportion, x = cigsday, fill = Treatment)) +
  geom_col(position = "dodge2") +
  labs(y = "", x = "Cigarettes per day") +
  facet_grid(cols = vars(source)) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, vjust = 0.5))

ggarrange(p0, p1, p2, p3, p4, p6, p5, ncol = 4, nrow = 2, 
          common.legend = T, 
         # align = "hv",
          legend = "bottom")

ggsave("whi_data.pdf", width = 10, height = 7)
