
# 1. Install & Load Packages
#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
#BiocManager::install("TCGAbiolinks")
#BiocManager::install("limma")

library(TCGAbiolinks)
library(limma)
library(magrittr)
library(SummarizedExperiment)
library(dplyr)
library(here)
library(expm)


## Set working directory to directory "./realdata3/"
#setwd("./realdata3/")


################################################################
################################################################
## SKIP DOWN TO "RUN FROM HERE TO REPLICATE OUR ANALYSIS" TO AVOID DOWNLOADING RAW TCGA DATA AGAIN
################################################################
################################################################

# Build one GDC query per cohort: STAR-count gene expression quantification
# restricted to primary-tumor samples.
query_luad_pt <- GDCquery(
  project       = "TCGA-LUAD",
  data.category = "Transcriptome Profiling",
  data.type     = "Gene Expression Quantification",
  workflow.type = "STAR - Counts",
  sample.type   = "Primary Tumor"
)
query_lusc_pt <- GDCquery(
  project       = "TCGA-LUSC",
  data.category = "Transcriptome Profiling",
  data.type     = "Gene Expression Quantification",
  workflow.type = "STAR - Counts",
  sample.type   = "Primary Tumor"
)

################################
################################
### DOWNLOADING TCGA-LUAD DATA.
# Fetch the queried files via the GDC API into ~/TCGA_LUAD.
GDCdownload(query_luad_pt,
            directory       = "~/TCGA_LUAD",
            method          = "api",
            files.per.chunk = 10)
# Assemble the downloaded files into a single object ...
TCGA_LUAD_data <- GDCprepare(query_luad_pt, directory = "~/TCGA_LUAD")
# ... and cache it in the current working directory for later runs.
saveRDS(TCGA_LUAD_data,
        file = paste0(getwd(), "/TCGA_LUAD_summary.rds"))

################################
################################
### DOWNLOADING TCGA-LUSC DATA.
# Same three steps for the LUSC cohort, using ~/TCGA_LUSC as download folder.
GDCdownload(query_lusc_pt,
            directory       = "~/TCGA_LUSC",
            method          = "api",
            files.per.chunk = 10)
TCGA_LUSC_data <- GDCprepare(query_lusc_pt, directory = "~/TCGA_LUSC")
saveRDS(TCGA_LUSC_data,
        file = paste0(getwd(), "/TCGA_LUSC_summary.rds"))

# The query objects are not needed once both cohorts are cached.
rm(list = c("query_lusc_pt", "query_luad_pt"))

################################################################
################################################################
## RUN FROM HERE TO REPLICATE OUR ANALYSIS
################################################################
################################################################

################################
################################
### IMPORT PRE-DOWNLOADED LUSC AND
### LUAD DATA TO ENVIRONMENT.

# Reload the cached LUAD object from the working directory and extract its
# (default) assay matrix.
TCGA_LUAD_data <- paste0(getwd(), "/TCGA_LUAD_summary.rds") %>%
  readRDS()
TCGA_LUAD_mat <- TCGA_LUAD_data %>%
  assay()

# Peek at the top-left 5 x 5 corner of the LUAD matrix.
TCGA_LUAD_mat[1:5, 1:5] %>%
  head()

# Same for the cached LUSC object.
TCGA_LUSC_data <- paste0(getwd(), "/TCGA_LUSC_summary.rds") %>%
  readRDS()
TCGA_LUSC_mat <- TCGA_LUSC_data %>%
  assay()

# Peek at the LUSC matrix corner and report its dimensions.
TCGA_LUSC_mat[1:5, 1:5] %>%
  head()
dim(TCGA_LUSC_mat)



################################
################################
### MAPPING CURRENT ROW NAMES (ENSEMBL
### GENE IDs) TO GENE SYMBOLS

#BiocManager::install("org.Hs.eg.db")
#BiocManager::install("janitor")
#BiocManager::install("clusterProfiler")
library(org.Hs.eg.db)
library(janitor)
library(clusterProfiler)

# Table of the LUAD matrix row names: enframe() stores them in column `value`;
# `ENSEMBL` is the same ID with its first ".<digits>" run removed
# (presumably the Ensembl version suffix -- confirm against the row names).
TCGA_LUAD_genes <- rownames(TCGA_LUAD_mat) %>%
  tibble::enframe() %>%
  mutate(ENSEMBL = stringr::str_replace(value, "\\.[0-9]+",""))

# Translate ENSEMBL IDs to gene symbols. The annotation database is queried
# ONCE here; the raw result feeds both the duplicate report and the final map
# (previously bitr() was run twice with identical arguments).
TCGA_LUAD_gene_map <- clusterProfiler::bitr(
  TCGA_LUAD_genes$ENSEMBL,
  fromType = "ENSEMBL",
  toType = "SYMBOL",
  OrgDb = org.Hs.eg.db
)

# Diagnostic only: show the first few symbols that occur more than once in
# the raw mapping.
TCGA_LUAD_gene_map %>%
  janitor::get_dupes(SYMBOL) %>%
  head()

# Keep the first row per symbol, then attach the original row name (`value`)
# for each ENSEMBL ID. The join key is spelled out so the join does not depend
# on whichever columns the two tables happen to share.
TCGA_LUAD_gene_map <- TCGA_LUAD_gene_map %>%
  distinct(SYMBOL, .keep_all = TRUE) %>%
  left_join(TCGA_LUAD_genes, by = "ENSEMBL")

head(TCGA_LUAD_gene_map)

################################
################################
### REPLACING ROW NAMES WITH GENE SYMBOLS

# The gene map was built from the LUAD row names only, yet it is used to index
# BOTH matrices. Verify up front that every mapped row ID also exists in the
# LUSC matrix, so a mismatch fails with a clear message instead of an opaque
# "subscript out of bounds".
if (!all(TCGA_LUAD_gene_map$value %in% rownames(TCGA_LUSC_mat))) {
  stop("Some LUAD-derived gene IDs are missing from the LUSC matrix row names.",
       call. = FALSE)
}

# Keep only the mapped rows (in gene-map order) and re-label them with the
# corresponding gene symbols.
TCGA_LUSC_mat <- TCGA_LUSC_mat[TCGA_LUAD_gene_map$value, ]
rownames(TCGA_LUSC_mat) <- TCGA_LUAD_gene_map$SYMBOL
TCGA_LUSC_mat[1:5, 1:5]
#TCGA_LUSC_mat$cancer_type <- "LUSC"

TCGA_LUAD_mat <- TCGA_LUAD_mat[TCGA_LUAD_gene_map$value, ]
rownames(TCGA_LUAD_mat) <- TCGA_LUAD_gene_map$SYMBOL
TCGA_LUAD_mat[1:5, 1:5]
#TCGA_LUAD_mat$cancer_type <- "LUAD"

dim(TCGA_LUAD_mat)
dim(TCGA_LUSC_mat)

# Both matrices were just labelled from the same SYMBOL vector, so this is
# expected to print TRUE; the substantive check is the guard at the top.
all.equal(rownames(TCGA_LUAD_mat), rownames(TCGA_LUSC_mat))

################################
################################
## SELECTING GENES RELEVANT TO LUNG
## CANCER VIA MALACARDS DATA.

# Read the MalaCards gene list from the working directory; the gene symbols
# are taken from its `Symbol` column.
malacards_data <- read.csv("MalaCardsGenesLungCancer.csv")

length(malacards_data$Symbol)

# Symbols present both in the MalaCards list and in the expression matrices.
mc_genes_select <- (intersect(malacards_data$Symbol, rownames(TCGA_LUAD_mat)))

# Restrict both matrices to the selected genes. drop = FALSE keeps the result
# a matrix even if only a single gene matches, so t() below always yields the
# same orientation.
LUAD_mat_red <- TCGA_LUAD_mat[rownames(TCGA_LUAD_mat) %in% mc_genes_select, ,
                              drop = FALSE]
LUSC_mat_red <- TCGA_LUSC_mat[rownames(TCGA_LUSC_mat) %in% mc_genes_select, ,
                              drop = FALSE]

dim(LUAD_mat_red)
dim(LUSC_mat_red)

# Transpose so that the selected genes become the columns.
LUAD_mat_t <- t(LUAD_mat_red)
LUSC_mat_t <- t(LUSC_mat_red)

# The data viewer only makes sense in an interactive session; skipping it
# otherwise keeps batch runs (e.g. Rscript) from aborting before the save step.
if (interactive()) {
  View(LUAD_mat_t)
  View(LUSC_mat_t)
}

dim(LUAD_mat_t)
dim(LUSC_mat_t)






################################
################################
## Drop intermediates, then save ONLY the two analysis-ready matrices:
rm(
  TCGA_LUSC_data, TCGA_LUAD_data,
  TCGA_LUSC_mat, TCGA_LUAD_mat,
  TCGA_LUAD_genes,
  TCGA_LUAD_gene_map,
  mc_genes_select, malacards_data,
  LUAD_mat_red, LUSC_mat_red)

# Name the saved objects explicitly instead of using save.image(), so the
# .RData file cannot pick up unrelated objects that happen to be in the
# session's global environment.
save(LUAD_mat_t, LUSC_mat_t, file = "W26_Data1.RData")

# Remove only what this script created; rm(list = ls()) would also wipe any
# unrelated objects in the user's workspace.
rm(LUAD_mat_t, LUSC_mat_t)









