## Title: sample_manually.R
## Author: Nick Manning
## Date: 4/15/2022

## Purpose: To import bibs of our article as df then
## separate into a shared 20% of articles to review and
## then the rest (40 % each for mc tfl)

# Required: 
## .bib files (from WoS)

# Results in: 
## 2 shared files (1 w 20% mc, 1 with 20% tfl)
## 4 individual files (1 mc, 1 tfl for each, with 40% each)
## 6 total exports:
  ## mc_20, tfl_20, mc_40_nm, mc_40_yl, tfl_40_nm, tfl_40_yl

## results in prescreening data used in screening_titleabs.R, 
##final_screened_results.R, and screen_subtopics.R

######################################

# Load Libraries -------------------------------

# NOTE: the workspace is intentionally not cleared here. `rm(list = ls())`
# deletes every object in the caller's global environment while leaving
# attached packages and options untouched; restart the R session to get a
# genuinely clean run.

library(revtools) #load in bib file as df
library(dplyr) # sample_n

# Report the working directory: the input paths below are relative to it.
# (message() is used so the value is also shown when the script is source()'d.)
message("Working directory: ", getwd())

# Load in bib files ----------------------------

tfl_all <- read_bibliography("Code/SourceData/wos_tfl_bib.bib")
mc_all <- read_bibliography("Code/SourceData/wos_mc_bib.bib") # only works for BibTex


# create variables with how many papers to select to get 20% and 40%
n_mc_20percent <- round(nrow(mc_all)*0.2)
n_tfl_20percent <- round(nrow(tfl_all)*0.2)

n_mc_40percent <- round(nrow(mc_all)*0.4)
n_tfl_40percent <- round(nrow(tfl_all)*0.4)

# Subset Papers -------------------------------------------

# Fix the random-number state so that re-running the script reproduces the
# same split. Changing this value produces a different allocation of papers.
set.seed(20220415)

# Split `papers` into disjoint review sets by shuffled row position.
#
# papers    data frame with one row per paper
# n_shared  number of rows both reviewers screen
# n_first   number of rows given to the first reviewer
#
# Returns a list of data frames:
#   shared = n_shared random rows
#   rest   = every row not in `shared` (original row order)
#   first  = n_first random rows of `rest`
#   second = the rows of `rest` not in `first` (original row order)
#
# Rows are selected by position rather than by anti-joining on all columns,
# so records that are exact duplicates of each other are not dropped together.
split_for_review <- function(papers, n_shared, n_first) {
  n_total <- nrow(papers)
  stopifnot(n_shared >= 0, n_first >= 0, n_shared + n_first <= n_total)

  position <- seq_len(n_total)
  shuffled <- sample.int(n_total)

  shared_rows <- shuffled[position <= n_shared]
  first_rows <- shuffled[position > n_shared & position <= n_shared + n_first]
  rest_rows <- sort(shuffled[position > n_shared])
  second_rows <- sort(shuffled[position > n_shared + n_first])

  list(
    shared = dplyr::slice(papers, shared_rows),
    rest = dplyr::slice(papers, rest_rows),
    first = dplyr::slice(papers, first_rows),
    second = dplyr::slice(papers, second_rows)
  )
}


# Metacoupling ----------------

mc_split <- split_for_review(mc_all, n_mc_20percent, n_mc_40percent)

# randomly selected 20% of the MC papers
mc_20 <- mc_split$shared

# the remaining 80% of the papers
mc_rest <- mc_split$rest

# random half of the remaining 80%, or 40% of total
mc_40_nm <- mc_split$first

# the other reviewer gets the other half of the remaining
mc_40_yl <- mc_split$second


# Tobler's First Law ----------

tfl_split <- split_for_review(tfl_all, n_tfl_20percent, n_tfl_40percent)

# randomly selected 20% of TFL papers
tfl_20 <- tfl_split$shared

# the remaining 80% of the papers not used above
tfl_rest <- tfl_split$rest

# random half of the remaining 80%, or 40% of total
tfl_40_nm <- tfl_split$first

# the other reviewer gets the other half of the remaining
tfl_40_yl <- tfl_split$second



# Export CSV's for review ---------------------------------

# Each entry pairs a destination path with the data frame written to it.
# Files are written in the order listed, using write.csv() defaults.
# Both reviewers receive the same shared 20% sets (mc_20 / tfl_20).
review_exports <- list(
  # source
  list(path = "Code/SourceData/wos_mc.csv", data = mc_all),
  list(path = "Code/SourceData/wos_tfl.csv", data = tfl_all),

  # for NM
  list(path = "DerivedData/prescreen/mc_20p_nm.csv", data = mc_20),
  list(path = "DerivedData/prescreen/tfl_20p_nm.csv", data = tfl_20),
  list(path = "DerivedData/prescreen/mc_40p_nm.csv", data = mc_40_nm),
  list(path = "DerivedData/prescreen/tfl_40p_nm.csv", data = tfl_40_nm),

  # for YL
  list(path = "DerivedData/prescreen/mc_20p_yl.csv", data = mc_20),
  list(path = "DerivedData/prescreen/tfl_20p_yl.csv", data = tfl_20),
  list(path = "DerivedData/prescreen/mc_40p_yl.csv", data = mc_40_yl),
  list(path = "DerivedData/prescreen/tfl_40p_yl.csv", data = tfl_40_yl)
)

for (review_export in review_exports) {
  write.csv(review_export$data, review_export$path)
}








# # test allocation ------------------------------
# 
# # Metacoupling Papers 
# mc_effort <- allocate_effort(reviewers = c("yl", "nm"),
#                           effort = 0.50, 
#                           proportion_checked = 0.20,
#                           #max_reviewers = 2,
#                           quiet = F)
# 
# mc_result <- distribute_tasks(data = mc_all, 
#                               reviewers = effort,
#                               file_name = "mc20.csv")
# 
# 
# mc_yl <- read.csv("mc20_yl.csv")
# mc_nm <- read.csv("mc20_nm.csv")
# 
# mc_same <- inner_join(yl, nm, by = "title")
# 
# # Toblers First Law Papers 
# tfl_effort <- allocate_effort(reviewers = c("yl", "nm"),
#                              effort = 0.50, 
#                              proportion_checked = 0.20,
#                              #max_reviewers = 2,
#                              quiet = F)
# 
# tfl_result <- distribute_tasks(data = mc_all, 
#                               reviewers = effort,
#                               file_name = "tfl20.csv")
# 
# tfl_yl <- read.csv("mc20_yl.csv")
# tfl_nm <- read.csv("mc20_nm.csv")
# 
# tfl_same <- inner_join(tfl_yl, tfl_nm, by = "title")



### Barplot Comparison of Prelim. Results
### 3/19/22
### Nick Manning

# NOTE: very manual, not ideal.
# I created the csv by copy and pasting the results from the Google Sheet, no
# idea if they are up-to-date or not

# Load Libraries ---------------------------------------------------------
library(ggplot2)
library(tidyverse)

# Read in csv and filter to only topics, type and papers ----------------
getwd()
csv <- read.csv("wos1.csv")

# remove the "other" row for now: keep at most the first 16 rows.
# head() is used instead of csv[1:16, ] so that a file with fewer than 16 rows
# is returned as-is rather than being padded with all-NA rows.
csv <- head(csv, 16)

# Plot -----------------------

# Side-by-side bars of n_papers per topic, one bar per mc_tfl value.
# geom_col() draws bar heights straight from the y values.
ggplot(data = csv, aes(x = topic, y = n_papers, fill = mc_tfl)) +
  geom_col(position = "dodge")

str(csv)





#' for subtopic synthesis
#' - revise search terms for `conservation` and update the original data by NM
#' 

library(dplyr)


## Build a matching key from a title: take the first 30 characters, lower-case
## them, collapse every run of punctuation/spaces into a single space, and trim.
## Both searches must be keyed the same way, so the rule lives in one place.
make_title_id <- function(title) {
  substring(text = title, first = 1, last = 30) %>%
    tolower() %>%
    gsub('[[:punct:] ]+', ' ', .) %>%
    stringr::str_squish()
}


## 1. the original search by NM
f.org <- './data/testlaws_mctfl_preplotting_usedByYL.xlsx'
data.org <- readxl::read_excel(f.org) %>%
  dplyr::filter(st == 'Cons.') %>%
  dplyr::mutate(id = make_title_id(title)) %>%
  dplyr::select(id, everything())
## check: prints TRUE when every id in the original search is unique
nrow(data.org) == dplyr::n_distinct(data.org$id)


## 2. the new search by YL
data.new1 <- readxl::read_excel('./data/st_Conservation_mc_0812.xls')
data.new2 <- readxl::read_excel('./data/st_Conservation_tfl_0812.xls')
data.new  <- rbind(data.new1, data.new2) %>%
  dplyr::rename(title = `Article Title`,
                year2  = `Publication Year`) %>%
  dplyr::select(title, year2) %>%
  dplyr::mutate(id = make_title_id(title)) %>%
  dplyr::select(id, everything())
nrow(data.new)
## check: prints TRUE when every id in the new search is unique
nrow(data.new) == dplyr::n_distinct(data.new$id)


## 3. to update the original search
## Keep only the original rows whose id also appears in the new search.
## The new search is reduced to one row per id first: the same paper can be
## returned by both the mc and the tfl export, and a repeated id on the right
## side of merge() would otherwise duplicate the matching original row.
data.org.update <- data.org %>%
  merge(
    x = .,
    y = data.new %>%
      dplyr::distinct(id, .keep_all = TRUE) %>%
      dplyr::rename(title2 = title),
    all.x = TRUE,
    by = 'id') %>%
  # rows without a match in the new search have title2 == NA
  dplyr::filter(!is.na(title2)) %>%
  dplyr::select(-title2, -year2) %>%
  as.data.frame()


# sum(!is.na(data.org.update$title2))

## Write next to the input file, with "_updateCons" inserted before the
## extension. The pattern is escaped and anchored so that only the literal
## trailing ".xlsx" is replaced.
writexl::write_xlsx(x = data.org.update, path = sub('\\.xlsx$', '_updateCons.xlsx', f.org))


# Title: testTFL_screen_testing_laws.R
# Author: Nick Manning & Yingjie Li
# Date: 6/13/2022
# Purpose: Read in the relevant articles CSV and separate it into 
# the _20p_shared files for YL. Screens the relevant files

# Requires:
## fully screened and merged CSV's (from `final_screened_results.R`)

# Results in: 
## "Does TFL Apply" and "What framework is more appropriate?" MC / TFL CSVs (in screen_apply)
## Plots of these results
## 20% YL CSV 


######################## START ##########################################

# 0) Load Libraries and Data -------------------------------------------------------------------------------------

## 0A) Load Libraries -------------------------------------------

# NOTE: the workspace is intentionally not cleared here. `rm(list = ls())`
# deletes every object in the caller's global environment while leaving
# attached packages and options untouched; restart the R session to get a
# genuinely clean run.

library(revtools) # import bib
library(readr) # read/write csv's
library(dplyr) # inner join
library(ggplot2)
library(stringr) # for splitting the column on if the laws hold or not 
library(tidyr) # for replace_na
library(patchwork) # plots in custom layout 

## 0B) Load Data from Previous Scripts ----------------------------

# Load Relevant Entries CSV
# Report the working directory: the input paths below are relative to it.
# (message() is used so the value is also shown when the script is source()'d.)
message("Working directory: ", getwd())

# NOTE(review): the MC file is read with base read.csv() (returns a data.frame)
# while the TFL file is read with readr::read_csv() (returns a tibble). The two
# readers name unnamed columns differently and may guess column types
# differently -- confirm this mix is intended before combining the two tables.
relevant_mc_100p <- read.csv("Code/DerivedData/relevant_mc_100p.csv")
relevant_tfl_100p <- read_csv("Code/DerivedData/relevant_tfl_100p.csv")


# 1) Prep Data for Testing if MC and TFL Hold --------------------------------------------------------------------

## 1A) Split Data Into NM & YL Sections  ---------------------------

# calculate number of 20% of entries 
# n_mc_20p <- round(nrow(relevant_mc_100p)*0.2)
# n_tfl_20p <- round(nrow(relevant_tfl_100p)*0.2)
# 
# 
# # split into random 20% for YL
# relevant_mc_20p <- as.data.frame(sample_n(relevant_mc_100p, n_mc_20p))
# relevant_tfl_20p <- as.data.frame(sample_n(relevant_tfl_100p, n_tfl_20p))
# 
# 
# ## 1B) Change Data to Fit `revtools` Requirements ------------------------
# 
# # change data to be the required format by removing excluded from source
# # import source bib and remove excluded data 
# raw_mc <- read.csv("Code/SourceData/wos_mc.csv")
# full_relv_mc_100p <- semi_join(raw_mc, relevant_mc_100p, "unique_id") # for NM
# full_relv_mc_20p <- semi_join(raw_mc, relevant_mc_20p, "unique_id") # for YL
# 
# raw_tfl <- read.csv("Code/SourceData/wos_tfl.csv")
# full_relv_tfl_100p <- semi_join(raw_tfl, relevant_tfl_100p, "unique_id") # for NM
# full_relv_tfl_20p <- semi_join(raw_tfl, relevant_tfl_20p, "unique_id") # for YL




# 2) Do the Screening for Testing if Laws Apply (NM 100%, YL 20%) ------------------------------------------------

# NOTE: have to do this in notes section, not select/exclude/unknown section to get two choices

# NOTE: Saved to Code/DerivedData/testing_laws

# NOTE: Testing format: AB where 
#   A = Does this study obey TFL? and 
#   B = What framework would be more appropriate for this study?

## A: Entries can be either Y, N, R, U, X
    # if TFL holds, type Y in the A place; if TFL doesn't, A = N, review/other = R, unsure = U, exclude = X

## B: Entries can be either MC, TFL, Both, or Unsure ## NOT SURE ABOUT THIS YET
    # EXAMPLE: An entry obeys TFL, but MC might be more encompassing. entry = YMC
    # EX 2: It's a review paper = RR
    # EX 3: doesn't obey TFL, but MC is appropriate = NMC
    # EX 4: might be a spatial methods paper, so obeys TFL and TFL more appropriate = YTFL


## 2.1 screen ----------------------------------------------------------------------------

### For NM ### -----------------

#screen_abstracts(full_relv_mc_100p) #done round 1! 6/17/2022, still some U's & X's to revisit
#screen_abstracts()


#screen_abstracts(full_relv_tfl_100p)
#screen_abstracts()

##### SAVED AS 





### For YL ### ------------------

# NOTE: Only uncomment and run the `screen_abstracts(var)` for initialization, then 
# run `screen_abstracts()` with the empty parentheses and upload progress CSV

### 1. for MC papers ~~~~~~~~~~~~~~~~~~~~
# screen_abstracts(full_relv_mc_20p)
# screen_abstracts()


### 2. for TFL papers ~~~~~~~~~~~~~~~~~~~
# screen_abstracts(full_relv_tfl_20p)
#screen_abstracts()





## 2.2 compare inter-coder reliability ---------------------------------------------------
# NM will make a new script for comparing results




# 3) Import screened data & merge with relevant entries -------------------------------------------------------
# Reads the two screening-result CSVs, builds a combined `applies` code for each
# entry, attaches that code and the subtopic (`st`) to the relevant-entries
# tables, drops entries coded "XX", and stacks MC and TFL into `testlaws_mctfl`.

## 3A) Import newly screened data ------------------

# import screened data 
# (paths have no directory component, so they resolve against the working directory)
testlaws_mc <- read.csv("scr_testTFL_mc100p_nm.csv")

testlaws_tfl <- read.csv("scr_testTFL_tfl100p_nm.csv")

# test to see how many entries in each ranking (how many XX or RR)
table(testlaws_mc$notes_og)
table(testlaws_tfl$notes_og)


# build the combined code by concatenating the TFL_apply and appropriate columns
# NOTE: paste0() turns a missing value into the literal text "NA"
testlaws_mc$notes_new <- paste0(testlaws_mc$TFL_apply, testlaws_mc$appropriate)
testlaws_tfl$notes_new <- paste0(testlaws_tfl$TFL_apply, testlaws_tfl$appropriate)

# change name in law-tested CSV so we can join to screened CSV
names(testlaws_mc)[names(testlaws_mc) == "notes_new"] <- "applies"
names(testlaws_tfl)[names(testlaws_tfl) == "notes_new"] <- "applies"


### Remerge into subtopics ### 


# join to relevant df and subset to only screened entries
# No `by` is given, so dplyr joins on every column name the two tables share
# (presumably only unique_id -- TODO confirm the relevant_* tables have no
# `applies` column). left_join keeps every row of relevant_*; rows without a
# match get applies = NA.
testlaws_mc <- left_join(relevant_mc_100p, testlaws_mc[,c("unique_id", "applies")])

testlaws_tfl <- left_join(relevant_tfl_100p, testlaws_tfl[,c("unique_id", "applies")])

## 3B) Create subtopic column ---------------------

# import relevancy screened data with subtopic
relevant_mc_100p_st <- read_csv("Code/DerivedData/relevant_mc_100p_st.csv")
relevant_tfl_100p_st <- read_csv("Code/DerivedData/relevant_tfl_100p_st.csv")

# join subtopics to relevancy 
# (again an implicit join on the shared column names of the two tables)
testlaws_mc <- left_join(testlaws_mc, relevant_mc_100p_st[,c("unique_id", "st")])
testlaws_tfl <- left_join(testlaws_tfl, relevant_tfl_100p_st[,c("unique_id", "st")])

# remove irrelevant articles
table(testlaws_mc$applies)

# subset() treats a missing condition as FALSE, so rows whose `applies` is NA
# (no screening result joined above) are removed together with the "XX" rows
testlaws_mc <- subset(testlaws_mc, applies != "XX") 
testlaws_tfl <- subset(testlaws_tfl, applies != "XX")

# NOTE: might need to add something here to clean up any R's 

# we can see that the index column is different for mc (...1) and tfl (...2)
names(testlaws_mc)
names(testlaws_tfl)

# chnge index column to be the same
# (rbind() on data frames needs the column names of both tables to match)
names(testlaws_tfl)[names(testlaws_tfl) == "...2"] <- "...1"

# join the testlaw mc and tfl variables  
testlaws_mctfl <- rbind(testlaws_mc, testlaws_tfl)


# 4) Format Data for Plotting  ----------------------------------------------------------------------------------

## 4A) Rename Subtopics ------------------------------------

# shorten the long subtopic names; any other value is kept unchanged
testlaws_mctfl <- testlaws_mctfl %>% 
  mutate(st = case_when(
    st == "Ag. Development" ~ "Ag. Dev.",
    st == "Species Migration" ~ "Sp Migr.",
    st == "Conservation" ~ "Cons.",
    st == "Governance" ~ "Gov.",
    st == "Land Change" ~ "LC",
    TRUE ~ st
  ))

# rename all entries that didn't fit a subtopic as Miscellaneous 
testlaws_mctfl$st <- testlaws_mctfl$st %>% replace_na("Misc.")

# test if the new subjects transferred
unique(testlaws_mctfl$st)

## 4B)  Re-format Entry Data ----------------------

### change data to keep only Y, N, and U ###

### NOTE 1: changing the stars to Y's sometimes doesn't work, might need to run again if NA's in final graph ###

## extract first character of each entry using str_sub ##
# str_sub takes the string plus the start and end positions; start = end = 1
# keeps only the first character of `applies` (the "does TFL apply" part)
testlaws_mctfl$applies_tfl <- str_sub(string = testlaws_mctfl$applies, start = 1, end = 1)
# testlaws_mctfl$approp <- str_sub(string = testlaws_mctfl$applies, start = 2) # gets the characters after 1st
# 
# 
# # test, should only be U (Unsure), MC, TFL, or B (Both -- experimental)
# unique(testlaws_mctfl$approp)

# test, should only be U (Unsure), Y (Yes), N (No)
unique(testlaws_mctfl$applies_tfl)

table(testlaws_mctfl$st)

# Columns kept for the pre-plotting export, in this order.
# `approp` is only created by the line that is commented out above, so asking
# for it unconditionally makes select() fail with a missing-column error.
# any_of() includes it when present and skips it otherwise; the remaining
# columns stay mandatory via all_of().
x_testlaws_csv <- select(
  testlaws_mctfl,
  all_of(c("title", "abstract", "topic", "st", "applies_tfl")),
  any_of("approp"),
  all_of(c("author", "year"))
)
#write_csv(x_testlaws_csv, "testlaws_mctfl_preplotting.csv")


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


######################## NEW SUBTOPIC SCREEN STEP: #######################

####################### UPDATE: WE CAN RUN THE CODE STARTING FROM THIS LINE WITH THE NEW CSV ######################################

# NOTE: the workspace is intentionally not cleared here. `rm(list = ls())`
# deletes every object in the caller's global environment while leaving
# attached packages and options untouched; restart the R session to get a
# genuinely clean run. Everything this section needs is re-created below.

library(revtools) # import bib
library(readr) # read/write csv's
library(dplyr) # inner join
library(ggplot2)
library(stringr) # for splitting the column on if the laws hold or not 
library(tidyr) # for replace_na
library(patchwork) # plots in custom layout 

# Report the working directory: the input path below is relative to it.
message("Working directory: ", getwd())

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# We uploaded this csv to Google Drive as an XLSX and manually removed topics that either:
## were picked up in a subtopic but weren't relevant bc they fit better in another subtopic == "better elsewhere"
## were decided to be a review == "review" 

# We then downloaded this screened CSV and removed the papers that should be excluded (below)

x_testlaws_csv <- read.csv("testlaws_mctfl_preplotting_screened_final_rescreen_nm_yl.csv", stringsAsFactors = FALSE) #NOTE: hardcoded here bc I was getting 999 rows
# relabel the metacoupling topic from "MC" to "MCF"
x_testlaws_csv$topic[x_testlaws_csv$topic == "MC"] <- "MCF"

table(x_testlaws_csv$note1)

# replace with first three letters: 
## bet = better elsewhere
## rev = review
## not = not relevant 
x_testlaws_csv$note1 <- substr(x_testlaws_csv$note1, 1, 3)

####### RESULTS: EXCLUDED FROM SUBTOPIC SCREENING ######
x_testlaws_csv %>% group_by(topic, note1) %>% summarise(count_excluded = n())
# add the better elsewhere, review, and not relevant for each to get the number removed
# the "" for each is the amount of papers screened

# remove all 'exclude' papers 
# %in% never returns NA, so rows whose `exclude` is missing are kept as real
# rows. Indexing with `exclude != "X"` would instead turn every such row into
# an all-NA row.
x_testlaws_csv <- x_testlaws_csv[!(x_testlaws_csv$exclude %in% "X"), ]
x_testlaws_csv %>% group_by(st, topic) %>% summarise(count_st = n())

testlaws_mctfl <- x_testlaws_csv

# # # # # # # # # # 
# # NOTE: the following steps are valid in "screen_testing_laws.R" but not here 
# # because I didn't use * or ! in the new data structure (new as in used in )


# # remove all X's -- taken care of earlier in the script 
# #testlaws_mctfl <- subset(testlaws_mctfl, applies_mc != "X") 
# testlaws_mctfl <- subset(testlaws_mctfl, applies_tfl != "X") 
# testlaws_mctfl <- subset(testlaws_mctfl, approp != "X") 

# # change all stars (*) to Y's 
# testlaws_mctfl$applies <- str_replace(testlaws_mctfl$applies, "\\*", "Y")
# testlaws_mctfl$applies <- str_replace(testlaws_mctfl$applies, "\\*", "Y") #have to run this line twice for some reason
 

# # change all !'s (prev. "^"s) to N's
# testlaws_mctfl$applies <- str_replace(testlaws_mctfl$applies, "!", "N")


# NOTE 2: Convenience problem here, I couldn't get "OR" operator to work so I just did it in steps
# remove all R's -- marked as X in the test 
#testlaws_mctfl <- subset(testlaws_mctfl, applies_mc != "R") 
#testlaws_mctfl <- subset(testlaws_mctfl, applies_tfl != "R") 
# # # # # # # # # # # # # # # # # # # # 




## 4C) Summarize into Count and Percent ---------------------------

# Translate the screening codes into their display labels and return them as
# a factor ordered for plotting. Any code not listed becomes NA.
label_applies_tfl <- function(code) {
  factor(
    code,
    levels = c("N", "NU", "U", "BU", "YU", "Y"),
    labels = c("No", "Likely No", "Unsure", "Both", "Likely Yes", "Yes")
  )
}

###  Entries per subtopic   ###
# count per (topic, st); the percentage is taken within each topic
testlaws_count_st <- testlaws_mctfl %>%
  group_by(topic, st) %>%
  summarise(count_st = n()) %>%
  mutate(perc_st = round(100 * count_st / sum(count_st), 0))

testlaws_count_st


###  Obeys TFL   ###
# count per (topic, st, applies_tfl); the percentage is taken within each
# (topic, st) pair, then the codes are replaced by ordered labels
testlaws_count_tfl <- testlaws_mctfl %>%
  group_by(topic, st, applies_tfl) %>%
  summarise(count_tfl = n()) %>%
  mutate(
    perc_tfl = round(100 * count_tfl / sum(count_tfl), 1),
    applies_tfl = label_applies_tfl(applies_tfl)
  )

testlaws_count_tfl


### Both Topics ###

# Both Topics -- TFL #
# same summary with MCF and TFL pooled: the percentage is taken within each st
testlaws_count_mctfl_tfl <- testlaws_mctfl %>%
  group_by(st, applies_tfl) %>%
  summarise(count_tfl = n()) %>%
  mutate(
    perc_tfl = round(100 * count_tfl / sum(count_tfl), 1),
    applies_tfl = label_applies_tfl(applies_tfl)
  )

# 5) Plot Data per Topic --------------------------------------------------------------------------------------------

## 5A) Specify Formatting Variables --------------


### ~~~ color ------------
# Create variable for plotting colors - could switch to a palette later

# used https://colorbrewer2.org/#type=diverging&scheme=PuOr&n=7 for colorblind-freindly
# palette and used plotrix package and color.id fxn to go from hex code to R color name

# Order:            Unsure,       No,       Partly,        Yes
# plot_colors = c("antiquewhite3","tomato","goldenrod1","chartreuse3") 
#plot_colors <- c("gray97","pink2","lemonchiffon","darkolivegreen3") # colorblind friendly
#plot_appr <- c("gray97","mediumaquamarine","darkorchid2")
#plot_uny = c("gray97","pink2","darkolivegreen3") # first attempt at colorblind-safe

# colorblind safe from Okabe & Ito 2008: https://jfly.uni-koeln.de/color/
#palette.colors(palette = "Okabe-Ito") 
#plot_uny <- c("#F0E442","#CC79A7","#009E73")

#color blind safe from rcartocolor package
# safe_colorblind_palette <- c("#88CCEE", "#CC6677", "#DDCC77", "#117733", "#332288", "#AA4499", 
#                               "#44AA99", "#999933", "#882255", "#661100", "#6699CC", "#888888")
# scales::show_col(safe_colorblind_palette)

# ORDER: No, Likely No, Unsure, Both, Likely Yes, Yes
# The vector is named by legend label. With a named vector the manual scales
# match colours by name, so a category keeps its colour even when another
# category is absent from the data being plotted. An unnamed vector is matched
# by position among the categories present and would shift.
plot_nuy <- c(
  "No"         = "#ac394c", # red
  "Likely No"  = "#CC6677", # light red
  "Unsure"     = "#888888", # gray
  "Both"       = "#DDCC77", # yellow
  "Likely Yes" = "#44AA99", # blue-green / teal
  "Yes"        = "#117733"  # green
)

## 5B) Plot the Number & Percent of Entries -----------------------

### NUMBER ###
# Horizontal bars of the entry count per subtopic, one facet per topic.
# `order_by` carries the MCF value (0 for other topics) so that reorder()
# arranges the subtopics by their MCF value.
count_plot_data <- dplyr::mutate(
  testlaws_count_st,
  order_by = ifelse(topic == 'MCF', count_st, 0)
)

plot_st_count <- ggplot(count_plot_data,
                        aes(x = reorder(st, order_by), y = count_st)) +
  geom_col() +
  geom_text(aes(label = count_st), position = "stack", hjust = 0) +
  facet_wrap(~ topic) +
  labs(x = "Sustainability Topic",
       y = "Number of Entries",
       title = "Number of Topic Entries for MCF & TFL") +
  coord_flip() +
  theme_bw()

plot_st_count

#ggsave(filename = "data/figures/testlaw_mc_count_0915.png", plot = plot_st_count, width = 12.5, height = 5, units = 'in', dpi = 300)


### PERCENT ###
# Same layout for the within-topic percentage; the value axis is fixed to 0-68.
perc_plot_data <- dplyr::mutate(
  testlaws_count_st,
  order_by = ifelse(topic == 'MCF', perc_st, 0)
)

plot_st_perc <- ggplot(perc_plot_data,
                       aes(x = reorder(st, order_by), y = perc_st)) +
  geom_col() +
  geom_text(aes(label = perc_st), position = "stack", hjust = 0) +
  ylim(0, 68) +
  facet_wrap(~ topic) +
  labs(x = "Sustainability Topic",
       y = "Percentage of Entries",
       title = "Percentage of Topic Entries for MCF & TFL") +
  coord_flip() +
  theme_bw()

plot_st_perc

#ggsave(filename = "data/figures/testlaw_mc_perc_0915.png", plot = plot_st_perc, width = 9, height = 5, units = 'in', dpi = 300)




## 5C) Plot Tobler's First Law ------------------------------------------------------------

### ~~ fig 4cd - each topic --------
bar_width <- 0.85
library(ggrepel)
# Does TFL Apply?? -- Percent grouped by topic (MC & TFL)

# Stacked bars of perc_tfl per subtopic, filled by the TFL answer, with one
# facet per topic. Each segment is labelled with its rounded percentage.
plot_tflapply_perc <- 
  testlaws_count_tfl %>%
  # label colour per segment: black for segments under 3 %, white otherwise
  dplyr::mutate(color_mark = ifelse(perc_tfl < 3, 'black', 'white')) %>%
  ggplot(aes(x = st, y = perc_tfl, fill = applies_tfl)) +
  geom_bar(stat = "identity", width = bar_width) +
  geom_text(size = 2.5,
            aes(label = paste(round(perc_tfl), "%"), #need for % in stacked area
                group = applies_tfl, 
                color = color_mark),
            position = position_stack(vjust = 0.5)) + #need for % in stacked area
  
  # geom_text_repel(aes(label = paste(round(perc_tfl), "%")), 
  #                 size = 2.5, 
  #                 position = position_stack_and_nudge(vjust = 0.5, y = 0.5),
  #                 ) +
  
  facet_wrap(~ topic) +
  
  labs(
    x = "Sustainability Topic",
    y = "Percent of Entries"
  ) +
  theme_bw() +
  scale_fill_manual("Does this entry\n obey TFL?", values = plot_nuy) +
  # Named values: each color_mark value maps to the colour of the same name.
  # Unnamed values are assigned by position among the values present, so a
  # plot in which every segment is >= 3 % (only 'white' present) would have
  # drawn its labels with the first colour, black.
  scale_color_manual(values = c(black = "black", white = "white")) +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    legend.position = "bottom"
  )


plot_tflapply_perc





### ~~ fig 4a - both topics -------------

## 5D) Plot if MCF and TFL Apply Across Mixed Topics

# Stacked bars of perc_tfl per subtopic with MCF and TFL entries pooled.
# A constant `topic` column is added only so that facet_wrap() draws a strip
# titled "Both Approaches". Legends are switched off on both layers.
plot_tflapply_mixedperc <- testlaws_count_mctfl_tfl %>%
  dplyr::mutate(topic = 'Both Approaches') %>%
  ggplot(aes(x = st, y = perc_tfl, 
             fill = applies_tfl, 
             label = paste(round(perc_tfl), "%")
             )) + 
    geom_bar(stat = "identity", show.legend = FALSE) + 
    geom_text(size = 2.5, position = position_stack(vjust = 0.5), show.legend = FALSE,
              # TRUE marks segments under 3 %
              aes(colour = perc_tfl < 3,
                  group = applies_tfl)) +
    # Named values: segments under 3 % (TRUE) get black labels, all others
    # white. Unnamed values are assigned by position among the values present,
    # which gives the wrong colour when only one of TRUE/FALSE occurs.
    scale_color_manual(values = c("FALSE" = "white", "TRUE" = "black")) +
    facet_wrap(~ topic) +
    labs(
      x = "Sustainability Topic",
      y = "Percent of Entries"
    ) +
    theme_bw() +
    scale_fill_manual("Does this entry\n obey TFL?", values = plot_nuy) +
    theme(
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      legend.justification = "right", # anchor the legend on its right side
      plot.title = element_text(hjust = -0.5) # title offset (no title is set in labs() above)
    )

plot_tflapply_mixedperc



### ~~ fig 4b - combined ----------

## 5E) see how many MCF and TFL papers overall fall into categories 

# count per (topic, applies_tfl); the percentage is taken within each topic
testlaws_countperc_topic <- testlaws_mctfl %>% 
  # first, create the count column
  group_by(topic, applies_tfl) %>% 
  summarise(count_topic = n()) %>% 
  # then, create the percent column
  mutate(perc_topic = round(count_topic / sum(count_topic)*100, 0)) %>% 
  # then, change the abbreviations to full words (unlisted codes become NA)
  mutate(applies_tfl = case_when(
    #applies_tfl == "T" ~ "Partly",
    applies_tfl == "N" ~ "No",
    applies_tfl == "NU" ~ "Likely No",
    applies_tfl == "U" ~ "Unsure",
    applies_tfl == "BU" ~ "Both",
    applies_tfl == "YU" ~ "Likely Yes",
    applies_tfl == "Y" ~ "Yes"))

# reorder the data for plotting  
testlaws_countperc_topic$applies_tfl <- factor(testlaws_countperc_topic$applies_tfl,
                                               levels = c("No", "Likely No", "Unsure", 
                                                          "Both", 
                                                          "Likely Yes", "Yes"))
testlaws_countperc_topic


# plot the percent of MCF and TFL entries that do and do not obey TFL
# A constant `facet_col` column is added only so that facet_wrap() draws a
# strip with that text. Legends are switched off on both layers.
plot_tflapply_topics_perc <- 
  testlaws_countperc_topic %>%
  dplyr::mutate(facet_col = "MCF & TFL Combined Topics") %>%
  ggplot(aes(x = topic, y = perc_topic,
             fill = applies_tfl,
             label = paste(round(perc_topic), "%") #need for % in stacked area
             )) +
  geom_bar(stat = "identity", width = bar_width, show.legend = FALSE) +
  geom_text(size = 2.5, 
            # TRUE marks segments under 5 %
            aes(colour = perc_topic < 5, group = applies_tfl),
            show.legend = FALSE,
            position = position_stack(vjust = 0.5)) + #need for % in stacked area
  # Named values: segments under 5 % (TRUE) get black labels, all others
  # white. Unnamed values are assigned by position among the values present,
  # which gives the wrong colour when only one of TRUE/FALSE occurs.
  scale_color_manual(values = c("FALSE" = "white", "TRUE" = "black")) +
  facet_wrap(~ facet_col) +
  labs(
    x = "Main Approach",
    y = "Percent of Entries"
  ) +
  theme_bw() +
  scale_fill_manual("Does this entry\n obey TFL?", values = plot_nuy) +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

plot_tflapply_topics_perc

# ggsave(filename = "data/figures/testlaw_mctfl_topics_perc_0915.png", plot = plot_tflapply_topics_perc, width = 6.4, height = 5, units = 'in', dpi = 300)


### ~~ fig 4 abcd ---------------

## 5F) Wrap Obey Figures into final figure -------------------------

# plot_obey_patch <-  plot_tflapply_mixedperc/ plot_tflapply_perc
# plot_obey_patch

library(ggpubr)

# top row: the pooled-topics plot next to the per-approach plot
obey_top_row <- ggarrange(
  plot_tflapply_mixedperc,
  plot_tflapply_topics_perc,
  ncol = 2,
  widths = c(1.08, 0.92),
  align = 'h'
)

# full figure: the top row above the per-topic plot, equal heights,
# with a single shared legend placed at the bottom
plot_obey_patch <- ggarrange(
  obey_top_row,
  plot_tflapply_perc,
  ncol = 1,
  heights = c(1, 1),
  legend = "bottom",
  common.legend = TRUE
)
plot_obey_patch
#ggsave(filename = "data/figures/testlaw_mctfl_perc_stacked_YL_0915.png", plot = plot_obey_patch, width = 6.4, height = 7.7, units = 'in', dpi = 300)




## 5G) Create and Plot bar graphs for the general number of MCF and TFL entry results -------------------------
# see how many MCF and TFL papers overall fall into categories

# count per (topic, applies_tfl); the percentage is taken within each topic.
# The codes are then replaced by their display labels as a factor ordered for
# plotting; any code not listed becomes NA.
testlaws_countperc_topic <- testlaws_mctfl %>%
  group_by(topic, applies_tfl) %>%
  summarise(count_topic = n()) %>%
  mutate(
    perc_topic = round(100 * count_topic / sum(count_topic), 0),
    applies_tfl = factor(
      applies_tfl,
      levels = c("N", "NU", "U", "BU", "YU", "Y"),
      labels = c("No", "Likely No", "Unsure", "Both", "Likely Yes", "Yes")
    )
  )

testlaws_countperc_topic



# plot the actual count of MCF and TFL entries
# Stacked bars per approach, each segment labelled with its count in white.
plot_tflapply_topics_count <-
  testlaws_countperc_topic %>%
  ggplot(aes(x = topic, y = count_topic,
             fill = applies_tfl,
             label = count_topic)) +
  geom_col(width = bar_width) +
  geom_text(size = 3, position = position_stack(vjust = 0.5), colour = "white") +
  labs(x = "Main Approach",
       y = "Number of Entries",
       title = "The Applicability of TFL under MCF and TFL Literature") +
  theme_bw() +
  scale_fill_manual("Does this entry obey TFL?", values = plot_nuy) +
  # element_blank() removes the legend title given to the fill scale above
  theme(legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

plot_tflapply_topics_count

#ggsave(filename = "data/figures/testlaw_mctfl_topics_count_0915.png", plot = plot_tflapply_topics_count, width = 6.4, height = 5, units = 'in', dpi = 300)





## 6) Export & Save CSVs and Plots -------------------------------

# No width/height is passed to any of these calls, so every file is written
# at the size of the graphics device that is current when the line runs.

# MC test law percentage plot facet-wrapped by topic
ggsave(filename = "data/figures/testlaw_mc_perc.png", plot = plot_mcapply_perc)

# TFL test law percentage plot facet-wrapped by topic
ggsave(filename = "data/figures/testlaw_tfl_perc.png", plot = plot_tflapply_perc)

# MC test law percentage plot, "mixedperc" variant
# NOTE(review): presumably the version pooled across both approaches -- confirm
ggsave(filename = "data/figures/testlaw_mc_mixedperc.png", plot = plot_mcapply_mixedperc)

# TFL test law percentage plot, "mixedperc" variant
# NOTE(review): presumably the version pooled across both approaches -- confirm
ggsave(filename = "data/figures/testlaw_tfl_mixedperc.png", plot = plot_tflapply_mixedperc)

# MC and TFL test law percentage plots stacked on top of each other
ggsave(filename = "data/figures/testlaw_mctfl_perc_stacked.png", plot = plot_obey_patch)

# NOTE: If there is an NA on the TFL plot, refer to 4B Note 1, should be taken care of


########################     END     #####################################################################################






# Extra Code: Plots & Whatnots -------------------------------------------------

# Does MC Apply?? -- Percent grouped by subtopic -- 2nd most helpful
# plot_mcapply_perc_st <- ggplot(testlaws_count_mc, aes(x = topic, y = perc_mc, fill = applies_mc)) + 
#   geom_bar(stat = "identity") + 
#   facet_wrap(~ st)+
#   theme_bw()+
#   labs(
#     x = "Sustainability Topic",
#     y = "Percent of Entries",
#     title = "Does Metacoupling Apply Across Sustainability Literature?",
#     subtitle = "Percent grouped by subtopic"
#   )+
#   scale_fill_manual(values = plot_colors)+
#   theme(
#     legend.title = element_blank()
#   )


# Does MC Apply?? -- Number grouped by MC and TFL
# ggplot(testlaws_count_mc, aes(x = st, y = count_mc, fill = applies_mc)) + 
#   geom_bar(stat = "identity") + 
#   facet_wrap(~ topic)+
#   labs(
#     x = "Sustainability Subtopic",
#     y = "Number of Entries",
#     title = "Does Metacoupling Apply Across Sustainability Literature?",
#     subtitle = "Number grouped by MC and TFL"
#   )+
#   theme_bw()+
#   scale_fill_manual(values = plot_colors)+
#   theme(
#     legend.title = element_blank()
#   )


# # Does MC Apply?? -- Number grouped by subtopic
# ggplot(testlaws_count_mc, aes(x = topic, y = count_mc, fill = applies_mc)) + 
#   geom_bar(stat = "identity") + 
#   facet_wrap(~ st)+
#   labs(
#     x = "Sustainability Subtopic",
#     y = "Number of Entries",
#     title = "Does Metacoupling Apply Across Sustainability Literature?",
#     subtitle = "Number grouped by subtopic"
#   )+
#   theme_bw()+
#   scale_fill_manual(values = plot_colors)+
#   theme(
#     legend.title = element_blank()
#   )
#   

############# APPROPRIATENESS ###########

# # # # # # # # # # # # # # # # # # # # # 
# ###   Appropriateness   ###
# testlaws_count_appr <- testlaws_mctfl %>% 
#   # first, create the count column
#   group_by(topic, st, approp) %>% 
#   summarise(count_appr = n()) %>% 
#   # then, create the percent column
#   mutate(perc_appr = round(count_appr / sum(count_appr)*100, 1)) %>% 
#   # then, change the abbreviations to full words
#   mutate(approp = case_when(
#     approp == "MC" ~ "MC",
#     approp == "TFL" ~ "TFL",
#     approp == "U" ~ "Unsure",
#     approp == "UU" ~ "Unsure")) 
# 
# # reorder the data for plotting  
# testlaws_count_appr$approp <- factor(testlaws_count_appr$approp,
#                                        levels = c("Unsure", "MC", "TFL"))
# 
# # Plot Appropriateness?? 
# 
# plot_appr_perc <- ggplot(testlaws_count_appr, 
#                          aes(x = st, 
#                              y = perc_appr/2, # y-axis 200% bc 100% MC + 100% TFL, so div. by 2  
#                              fill = approp)) + 
#   geom_bar(stat = "identity") + 
#   # facet_wrap(~ topic)+ # wrap into sections by topic
#   labs( # add labels to x and y axes and add a title and subtitle
#     x = "Sustainability Subtopic",
#     y = "Percent of Entries",
#     title = "Which Framework is more appropriate?", # Most Appropriate Framework
#     subtitle = "Percent grouped by MC and TFL"
#   )+  
#   theme_bw()+ # add a cleaner theme than the gray background default
#   scale_fill_manual(values = plot_appr)+ # add a custom color scheme
#   theme(
#     legend.title = element_blank() # remove legend title 
#   )
# #added parentheses so we can create the variable and run the plot in the same section
# 
# plot_appr_perc
# # # # # # # # # # # # # # # # # # # # # 


# Title: 04_Heatmap_SubtopicYears.R
# Author: Nick Manning
# Date: 10/10/2022
# Purpose: Read in the final CSV and plot the number of papers in each
# subtopic per each year

# Requires:
## final screened CSV x_testlaws.csv

# Results in: 
## figure of papers over time 
########################################3

# 0) Load Libraries -----------
rm(list = ls())
getwd()

library(dplyr)
library(ggplot2)

# 1) Bring in CSV and clean according to 03_testTFL script -----------
df <- read.csv(
  "testlaws_mctfl_preplotting_screened_final_rescreen_nm_yl.csv",
  stringsAsFactors = FALSE
) # NOTE: hardcoded here bc I was getting 999 rows

# relabel the metacoupling approach from "MC" to "MCF"
df$topic[df$topic == "MC"] <- "MCF"

table(df$note1)

# replace with first three letters:
## bet = better elsewhere
## rev = review
## not = not relevant
df$note1 <- substr(df$note1, 1, 3)

####### RESULTS: EXCLUDED FROM SUBTOPIC SCREENING ######
df %>% group_by(topic, note1) %>% summarise(count_excluded = n())
# add the better elsewhere, review, and not relevant for each to get the number removed
# the "" for each is the amount of papers screened

# remove all 'exclude' papers
# `%in%` never returns NA, so a row whose `exclude` is missing is kept as-is;
# with `!= "X"` such a row would come back as a row made entirely of NAs.
df <- df[!(df$exclude %in% "X"), ]
df %>% group_by(st, topic) %>% summarise(count_st = n())

testlaws_mctfl <- df

# 2) group by years to see trend over time
# keep only the columns used for the yearly counts
testlaws_mctfl <- testlaws_mctfl %>%
  select(c("title", "topic", "st", "applies_tfl", "year"))

# number of entries per subtopic, approach and publication year
df.year <- testlaws_mctfl %>%
  group_by(st, topic, year) %>%
  summarise(count_year = n())


# one table per approach ...
df.year.tfl <- df.year[df.year$topic == "TFL", ]
df.year.mc <- df.year[df.year$topic == "MCF", ]

# ... and one with both approaches pooled (counted per subtopic and year only)
df.year.mctfl <- testlaws_mctfl %>%
  group_by(st, year) %>%
  summarise(count_year = n())

# 3) plot heat map ---------

# Build one subtopic-by-year heat map of entry counts.
#   data : data frame with the columns `year`, `st` and `count_year`
#   title: plot title
# Returns the ggplot object (it is not printed here).
plot_year_heatmap <- function(data, title) {
  # one x-axis break per publication year; na.rm keeps a missing year from
  # turning the whole range into NA
  year_breaks <- seq(
    min(data$year, na.rm = TRUE),
    max(data$year, na.rm = TRUE),
    by = 1
  )

  ggplot(data, aes(year, st)) +
    geom_tile(aes(fill = count_year)) +
    scale_x_continuous(breaks = year_breaks) +
    #scale_fill_gradient(breaks = seq(0,18, by = 1))+
    theme_bw() +
    coord_fixed() + # square tiles
    geom_text(aes(label = count_year), color = "white") +
    labs(
      x = "Year of Publication",
      y = "Sustainability Topic",
      title = title
    ) +
    guides(fill = guide_colorbar(title = "# of Entries")) +
    theme(axis.text.x = element_text(angle = 90))
}

# plot MC as heat map
heat.mc <- plot_year_heatmap(df.year.mc, "Publication Dates (MCF)")

heat.mc

# plot TFL as heat map
heat.tfl <- plot_year_heatmap(df.year.tfl, "Publication Dates (TFL)")

heat.tfl


# plot both data together
heat.mctfl <- plot_year_heatmap(df.year.mctfl, "Publication Dates (MCF & TFL)")


heat.mctfl


# 3.1) Facet Wrap ------------------
# Same heat map as above, but with one panel per approach stacked vertically.
# The assignment is no longer wrapped in parentheses: that printed the plot
# once, and the explicit `heat.wrap` line below printed it a second time.
heat.wrap <- ggplot(df.year, aes(year, st)) +
  geom_tile(aes(fill = count_year)) +
  # one break per year; na.rm keeps a missing year from breaking seq()
  scale_x_continuous(
    breaks = seq(
      min(df.year$year, na.rm = TRUE),
      max(df.year$year, na.rm = TRUE),
      by = 1
    )
  ) +
  #scale_fill_gradient(breaks = seq(0,18, by = 1))+
  facet_wrap(~topic, ncol = 1) +
  theme_bw() +
  #coord_fixed()+
  geom_text(aes(label = count_year), color = "white") +
  labs(
    x = "Year of Publication",
    y = "Sustainability Topic",
    title = "Publication Dates"
  ) +
  guides(fill = guide_colorbar(title = "# of Entries")) +
  theme(axis.text.x = element_text(angle = 90))

heat.wrap

ggsave(filename = "data/figures/heatmap_topicsdates.png", plot = heat.wrap, width = 6.4, height = 5, units = 'in', dpi = 300)


# Title: 04_GraphAbs
# Date: 10/13/22
# Purpose: Create a simple barplot for the graphical abstract
# Author: Nick Manning

##############################33

# 0) Load Libraries and Data -------------------------------------------------------------------------------------

## 0A) Load Libraries -------------------------------------------
rm(list =ls())

library(revtools) # import bib
library(readr) # read/write csv's
library(dplyr) # inner join
library(ggplot2)
library(stringr) # for splitting the column on if the laws hold or not 
library(tidyr) # for replace_na
library(patchwork) # plots in custom layout 


## 0B) Load Data from Previous Scripts ----------------------------
x_testlaws_csv <- read.csv(
  "testlaws_mctfl_preplotting_screened_final_rescreen_nm_yl.csv",
  stringsAsFactors = FALSE
) # NOTE: hardcoded here bc I was getting 999 rows

# relabel the metacoupling approach from "MC" to "MCF"
x_testlaws_csv$topic[x_testlaws_csv$topic == "MC"] <- "MCF"

table(x_testlaws_csv$note1)


# 1) Format Data ---------------------------------

# 1.1) replace with first three letters ----------
## bet = better elsewhere
## rev = review
## not = not relevant
x_testlaws_csv$note1 <- substr(x_testlaws_csv$note1, 1, 3)

# remove all 'exclude' papers
# `%in%` never returns NA, so a row whose `exclude` is missing is kept as-is;
# with `!= "X"` such a row would come back as a row made entirely of NAs.
x_testlaws_csv <- x_testlaws_csv[!(x_testlaws_csv$exclude %in% "X"), ]
x_testlaws_csv %>% group_by(st, topic) %>% summarise(count_st = n())

testlaws_mctfl <- x_testlaws_csv


# 1.2) Add additional columns and summaries necessary to plot --------------
# Both Topics -- TFL #
# Count entries per subtopic and TFL answer, then express each count as a
# percentage of its subtopic (summarise() leaves the table grouped by `st`,
# which is what makes sum(count_tfl) a per-subtopic total).
testlaws_count_mctfl_tfl <- testlaws_mctfl %>%
  group_by(st, applies_tfl) %>%
  summarise(count_tfl = n()) %>%
  mutate(
    perc_tfl = round(count_tfl / sum(count_tfl) * 100, 1),
    # spell out the screening codes; any code not listed (and NA) becomes NA
    applies_tfl = unname(
      c(
        N = "No",
        NU = "Likely No",
        U = "Unsure",
        BU = "Both",
        YU = "Likely Yes",
        Y = "Yes"
      )[as.character(applies_tfl)]
    )
  )

# fix the plotting order of the answers, from "No" through to "Yes"
testlaws_count_mctfl_tfl$applies_tfl <- factor(
  testlaws_count_mctfl_tfl$applies_tfl,
  levels = c("No", "Likely No", "Unsure", "Both", "Likely Yes", "Yes")
)


# 2) Plot! --------------------------------------------

# 2.1) Set Color Scheme -----------------

# 2.1.1 Manually set from __ package
# Fill colours for the six TFL answers, in the same order as the factor levels
# of `applies_tfl`.
#ORDER: No, likely no, unsure, both, likely yes, yes
plot_nuy <- c("#ac394c", "#CC6677", "#888888", "#DDCC77", "#44AA99", "#117733")
# NO: ac394c = dark red
# LIKELY NO: CC6677 = light red
# UNSURE: 888888 = gray
# BOTH: DDCC77 = yellow
# LIKELY YES: 44AA99 = blue-green / teal
# YES: 117733 = green

# 2.2) Re-order dataset as factor based on Jack's Comment -------
# Fixes the order of the subtopics on the x-axis. Any `st` value that is not
# spelled exactly like one of these levels becomes NA.
testlaws_count_mctfl_tfl$st <- factor(testlaws_count_mctfl_tfl$st, 
                          levels=c("Ag. Dev.", "Cons.", "Gov.", "LC",
                                   "Trade", "Sp Migr.", "Tourism", "Misc."))


# 2.3) Plot if MCF and TFL Apply Across Mixed Topics ------------
textsize <- 12 # font size shared by the theme elements below

# Stacked percentage bars per sustainability topic, both approaches pooled,
# drawn on a transparent background for the graphical abstract.
plot_tflapply_mixedperc <- testlaws_count_mctfl_tfl %>%
  # constant column so facet_wrap() draws a single panel titled "Both Approaches"
  dplyr::mutate(topic = 'Both Approaches') %>%
  ggplot(aes(x = st, y = perc_tfl,
             fill = applies_tfl)) +
  geom_bar(stat = "identity", show.legend = TRUE) +
  # (the former scale_color_manual() was dropped: no colour aesthetic is mapped)
  facet_wrap(~ topic) +
  labs(
    x = "Sustainability Topic",
    y = "Percent of Entries") +
  theme_bw() +
  scale_fill_manual("Does this entry\n obey TFL?", values = plot_nuy) +
  theme(
    legend.title = element_blank(), # blanks the legend title given above
    legend.text = element_text(size = textsize), #NEW
    strip.text.x = element_text(size = textsize), # change facet_wrap text size
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = textsize),
    axis.text.y = element_text(size = textsize - 1),
    axis.title.y = element_text(size = textsize),
    #axis.text.x = element_blank(),
    axis.title.x = element_blank(), # hides the x-axis title set in labs()
    legend.position = "bottom", # put the legend below the plot
    #legend.justification = "right",
    plot.title = element_text(hjust = -0.5), # title offset; no title is set in labs()

    # make transparent
    legend.background = element_rect(fill = "transparent"),
    legend.box.background = element_rect(fill = "transparent"),
    panel.background = element_rect(fill = "transparent"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "transparent", color = NA)
  ) +
  guides(fill = guide_legend(nrow = 1)) # all legend keys on one row

plot_tflapply_mixedperc

getwd()
# pass the plot explicitly instead of relying on "the last plot displayed"
ggsave("data/figures/graph_abs_barplot.png", plot = plot_tflapply_mixedperc,
       bg = 'transparent',
       width = 6.25, height = 4, units = "in", dpi = 300)


# Title: compare_results
# Author: Nick Manning & Yingjie Li
# Date: 4/26/2022
# Purpose: Compare results in the 20% overlap and update based on our meeting

# Required:
## 20p CSV's from screening_titleabs_.R

# Results in:
## updated 20p CSV's stored in /screen_titleabs_summ
## alignment statistics for manuscript

##########################

# Start from an empty workspace (attached packages are not affected by rm()).
rm(list=ls())
library(revtools)
library(dplyr) # mutate(), summarise() and the %>% pipe
library(readr) ## `read_csv()` is preferred over base `read.csv()`, which often causes errors.


# Load in CSV's ----------------------
getwd()

# Screening summaries of the shared 20% subset, one file per reviewer
# (file suffix: nm or yl) and per literature set (mc or tfl).
mc_20p_yl <- read_csv("data/screen_titleabs_summ/screened_summ_mc_20p_yl.csv")
mc_20p_nm <- read_csv("data/screen_titleabs_summ/screened_summ_mc_20p_nm.csv")

tfl_20p_yl <- read_csv("data/screen_titleabs_summ/screened_summ_tfl_20p_yl.csv")
tfl_20p_nm <- read_csv("data/screen_titleabs_summ/screened_summ_tfl_20p_nm.csv")


# Rename the screenedabstracts column with author initials -----------------
# names(mc_20p_yl)[names(mc_20p_yl) == 'screened_abstracts'] <- 'screened_abstracts_yl'
# names(mc_20p_nm)[names(mc_20p_nm) == 'screened_abstracts'] <- 'screened_abstracts_nm'
# 
# names(tfl_20p_yl)[names(tfl_20p_yl) == 'screened_abstracts'] <- 'screened_abstracts_yl'
# names(tfl_20p_nm)[names(tfl_20p_nm) == 'screened_abstracts'] <- 'screened_abstracts_nm'




# Compare ####################

# Both comparisons join NM's sheet (x) to YL's sheet (y) on every column
# except the reviewer-specific ones, so the merged table carries
# `screened_abstracts.x` / `notes.x` (NM) and `screened_abstracts.y` /
# `notes.y` (YL). The key columns are passed by NAME: the previous logical
# mask was applied positionally to both tables and would silently join on the
# wrong columns if their column order ever differed.

## for mc --------------------
mc_20p <- merge(
  x = mc_20p_nm,
  y = mc_20p_yl,
  by = setdiff(names(mc_20p_nm), c("screened_abstracts", "notes"))
) %>%
  # 1 = same decision, 0 = different, NA = at least one decision missing
  dplyr::mutate(same = ifelse(`screened_abstracts.x` == `screened_abstracts.y`, 1, 0))

# share of NM's entries on which both reviewers agree
mc_20p_alignment <- mc_20p %>%
  dplyr::summarise(alignment = sum(same, na.rm = TRUE) / nrow(mc_20p_nm))

mc_20p_alignment$alignment %>%
  scales::percent(accuracy = 0.01) %>%
  cat('Alignment between NM and YL:', ., 'for MC', "\n") ## 94.55%


## for tfl --------------------
tfl_20p <- merge(
  x = tfl_20p_nm,
  y = tfl_20p_yl,
  by = setdiff(names(tfl_20p_nm), c("screened_abstracts", "notes"))
) %>%
  dplyr::mutate(same = ifelse(`screened_abstracts.x` == `screened_abstracts.y`, 1, 0))

tfl_20p_alignment <- tfl_20p %>%
  dplyr::summarise(alignment = sum(same, na.rm = TRUE) / nrow(tfl_20p_nm))

tfl_20p_alignment$alignment %>%
  scales::percent(accuracy = 0.01) %>%
  cat('Alignment between NM and YL:', ., 'for TFL', "\n") ## 94.44%


# Keep only selected papers #####################

# update TFL decisions after 5/11/2022 meeting
# (overrides the `.x` decision column for one entry)
tfl_20p_updated <- tfl_20p
tfl_20p_updated$screened_abstracts.x[
  tfl_20p_updated$unique_id %in% "WOS:000407512100001"
] <- "selected"

# update MC decisions after 5/11/2022 meeting
# (overrides the `.y` decision column for three entries)
mc_20p_updated <- mc_20p
mc_20p_updated$screened_abstracts.y[
  mc_20p_updated$unique_id %in% c(
    "WOS:000641612900001",
    "WOS:000332185100002",
    "WOS:000749000800004"
  )
] <- "selected"

# save the reconciled shared-20% tables
write_csv(mc_20p_updated, "data/screen_titleabs_summ/screened_summ_mc_20p_both.csv")
write_csv(tfl_20p_updated, "data/screen_titleabs_summ/screened_summ_tfl_20p_both.csv")



# Title: final_results
# Author: Nick Manning & Yingjie Li
# Date: 5/11/2022
# Purpose: import separate screened CSV's and outputs one final 
# CSV each for the MC and TFL datasets

# Requires: 
## screened summary CSV's (from screening_titleabs_nm & screening_titleabs_YL.R)
## raw CSV's of the bibs (from sample_manually.R)

# Results in:
## _selected_full.csv files stored in /screen_titleabs_summ and used 
## in screen_subtopics.R 

##########################

library(readr) #read_csv
library(revtools) # import bibs

#########################

rm(list = ls())
getwd()

#dir.data <- 

# Import screened CSV's ------------------

# shared 20%
mc_20p <- read_csv('data/screen_titleabs_summ/screened_summ_mc_20p_both.csv')
tfl_20p <- read_csv('data/screen_titleabs_summ/screened_summ_tfl_20p_both.csv')

# For each reviewer's 40% file, keep only the entries marked "selected".
# `%in%` is used instead of `==` because it never returns NA: an entry with a
# missing decision is simply dropped, whereas `== "selected"` would insert a
# row made entirely of NAs into the result.

# MC
mc_40p_nm <- read_csv('data/screen_titleabs_summ/screened_summ_mc_40p_nm.csv')
mc_40p_nm <- mc_40p_nm[mc_40p_nm$screened_abstracts %in% "selected", ]

mc_40p_yl <- read_csv('data/screen_titleabs_summ/screened_summ_mc_40p_yl.csv')
mc_40p_yl <- mc_40p_yl[mc_40p_yl$screened_abstracts %in% "selected", ]


# selected MC entries from both individual 40% subsets
sel_mc_80p <- rbind(mc_40p_nm, mc_40p_yl)


# TFL
tfl_40p_nm <- read_csv('data/screen_titleabs_summ/screened_summ_tfl_40p_nm.csv')
tfl_40p_nm <- tfl_40p_nm[tfl_40p_nm$screened_abstracts %in% "selected", ]


tfl_40p_yl <- read_csv('data/screen_titleabs_summ/screened_summ_tfl_40p_yl.csv')
tfl_40p_yl <- tfl_40p_yl[tfl_40p_yl$screened_abstracts %in% "selected", ]


# selected TFL entries from both individual 40% subsets
sel_tfl_80p <- rbind(tfl_40p_nm, tfl_40p_yl)


# Create one df for MC and for TFL by selecting then merging --------------

# keep only selected values
# (`%in%` drops entries with a missing decision instead of turning them into
# all-NA rows, which `== "selected"` does)
sel_mc_20p <- mc_20p[mc_20p$screened_abstracts.x %in% "selected", ]
sel_tfl_20p <- tfl_20p[tfl_20p$screened_abstracts.x %in% "selected", ]

# drop all columns that aren't the same from before
# The first 9 columns are kept by POSITION and then renamed to the 80% table's
# names so rbind() lines up.
# NOTE(review): assumes the 80% tables have exactly 9 columns in the same
# order as the first 9 columns of the merged 20% tables -- confirm.
sel_mc_20p <- sel_mc_20p[1:9]
names(sel_mc_20p) <- names(sel_mc_80p)

sel_tfl_20p <- sel_tfl_20p[1:9]
names(sel_tfl_20p) <- names(sel_tfl_80p)

# merge with the 80p from above
sel_mc_100p <- rbind(sel_mc_80p, sel_mc_20p)
sel_tfl_100p <- rbind(sel_tfl_80p, sel_tfl_20p)

# Add back in the abstracts from the source bibs ---------------------
getwd()

# import source csv's, keeping only the join key and the abstract text
# select() and left_join() are called as dplyr:: because this script only
# attaches readr and revtools.
mc_source <- read_csv("Code/SourceData/wos_mc.csv")
mc_source <- dplyr::select(mc_source, c("unique_id", "abstract"))

tfl_source <- read_csv("Code/SourceData/wos_tfl.csv")
tfl_source <- dplyr::select(tfl_source, c("unique_id", "abstract"))

# add abstract column to selected papers
# (a left join keeps every selected paper; one without a match gets NA)
sel_mc_100p <- dplyr::left_join(sel_mc_100p, mc_source, by = "unique_id")
sel_tfl_100p <- dplyr::left_join(sel_tfl_100p, tfl_source, by = "unique_id")

# Export final CSV's -----------------------
getwd()

write_csv(sel_mc_100p, "data/screen_titleabs_summ/mc_selected_full.csv")
write_csv(sel_tfl_100p, "data/screen_titleabs_summ/tfl_selected_full.csv")





# Title: master_scr_relevance.R
# Author: Nick Manning & Yingjie Li
# Date: 6/13/2022

# Purpose: Serve as a one-stop for the entire process instead of several 
# scripts. Might be a bad idea. 

# Requires:
## raw CSV's of the bibs (from sample_manually.R)
## raw .bib files (from WoS)
## fully screened and merged CSV's (from final_screened_results.R)

# Results in: 
## subtopic CSV's and XLSX's
## figures for count and percent
## excluded entry CSV's

# Part 1: ~ sample_manually.R
# Part 2: ~ screening_titleabs_nm.R & screening_titleabs_YL.R
# Part 3: ~ compare_results_20p.R
# Part 4: ~ final_screened_results.R
# Part 5: ~ screen_subtopics.R

##### LOG #####

# 06132022
# Created file, copy & pasted all previous code and created headers


#############################################################################

# PART 0: SET-UP -------------------------------------------------------------------
# 0.1) Working Directory Info -----------

# Start from an empty workspace and show the working directory; every path
# below is relative to it.
rm(list =ls())
getwd()

# Folder prefixes (with trailing slash) meant to be combined with a file name
# via paste0().
dir.deriveddata <- "./Code/DerivedData/"
dir.sourcedata <- "./Code/SourceData/"


# 0.2) Load Libraries -------------------

library(revtools) # import bib
library(readr) # read/write csv's
library(dplyr) # inner join
library(writexl) # export final Excel files
library(ggplot2) # plotting 


# 0.3) Define Functions ----------------

# Reduce a screened table to the summary columns kept after Part 2.
#   screened_set: data frame holding at least the eight columns named below
# Returns `screened_set` restricted to those columns, in the order listed.
F.summ_screen <- function(screened_set){
  dplyr::select(
    screened_set,
    c('author', 'title', 'year', 'journal',
      'doi', 'unique_id', 'screened_abstracts', 'notes')
  )
}


#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #

# PART 1: Split .bib files into 20% shared & 40% split for relevance -------------

#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #


# 1.1) Load in bib files and prep initial % vars -------------------------------

# The raw WoS exports are read from the SOURCE data folder, matching
# sample_manually.R, which reads the same two files from "Code/SourceData/"
# (they were previously looked up under dir.deriveddata).
tfl_all <- read_bibliography(paste0(dir.sourcedata, "wos_tfl_bib.bib"))
mc_all <- read_bibliography(paste0(dir.sourcedata, "wos_mc_bib.bib")) # from WoS only works for BibTex


# create variables with how many papers to select to get 20% and 40%
n_mc_20percent <- round(nrow(mc_all) * 0.2)
n_tfl_20percent <- round(nrow(tfl_all) * 0.2)

n_mc_40percent <- round(nrow(mc_all) * 0.4)
n_tfl_40percent <- round(nrow(tfl_all) * 0.4)

# 1.2) Subset Papers -----------------------------------------
# NOTE: no random seed is set here, so every run draws a different split.


# 1.2.1) Subset Metacoupling ----------------

# shared subset: a random 20% of the MC papers
mc_20 <- mc_all %>% sample_n(n_mc_20percent)

# everything that was not drawn into the shared subset (the remaining 80%)
mc_rest <- mc_all %>% anti_join(mc_20)

# NM's share: a random half of the remainder, i.e. 40% of the total
mc_40_nm <- mc_rest %>% sample_n(n_mc_40percent)

# YL's share: whatever is left of the remainder
mc_40_yl <- mc_rest %>% anti_join(mc_40_nm)


# 1.2.2) Subset Tobler's First Law ----------

# shared subset: a random 20% of the TFL papers
tfl_20 <- tfl_all %>% sample_n(n_tfl_20percent)

# everything that was not drawn into the shared subset (the remaining 80%)
tfl_rest <- tfl_all %>% anti_join(tfl_20)

# NM's share: a random half of the remainder, i.e. 40% of the total
tfl_40_nm <- tfl_rest %>% sample_n(n_tfl_40percent)

# YL's share: whatever is left of the remainder
tfl_40_yl <- tfl_rest %>% anti_join(tfl_40_nm)


# 1.3) Export CSV's for review (omitted) ------------------------

# ## NOTE: Commented this section so we don't overwrite our previous split data
# 
# # source 
# write.csv(mc_all, "Code/SourceData/wos_mc.csv")
# write.csv(tfl_all, "Code/SourceData/wos_tfl.csv")
# 
# # for NM
# write.csv(mc_20, "DerivedData/prescreen/mc_20p_nm.csv")
# write.csv(tfl_20, "DerivedData/prescreen/tfl_20p_nm.csv")
# 
# write.csv(mc_40_nm, "DerivedData/prescreen/mc_40p_nm.csv")
# write.csv(tfl_40_nm, "DerivedData/prescreen/tfl_40p_nm.csv")
# 
# # for YL
# write.csv(mc_20, "DerivedData/prescreen/mc_20p_yl.csv")
# write.csv(tfl_20, "DerivedData/prescreen/tfl_20p_yl.csv")
# 
# write.csv(mc_40_yl, "DerivedData/prescreen/mc_40p_yl.csv")
# write.csv(tfl_40_yl, "DerivedData/prescreen/tfl_40p_yl.csv")


#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #

# PART 2: Screen for relevance & export summary CSV results -----------------------

#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #

# 2.1) NM relevance screening ---------------------------------------------

# 2.1.1) Import CSV's ---------------------------------------------
# NM's pre-screening files: the shared 20% and NM's own 40%, for each set.

pre_mc_20p <- read.csv(file = paste0(dir.deriveddata, "prescreen/mc_20p_nm.csv"))
pre_mc_40p <- read.csv(file = paste0(dir.deriveddata, "prescreen/mc_40p_nm.csv"))

pre_tfl_20p <- read.csv(file = paste0(dir.deriveddata, "prescreen/tfl_20p_nm.csv"))
pre_tfl_40p <- read.csv(file = paste0(dir.deriveddata, "prescreen/tfl_40p_nm.csv"))

# 2.1.2) Screen Titles and Abstracts & Save to screen_titleabs ----
# NOTE: commented so the GUI doesn't open 

# # screen the shared 20% mc file
# screen_abstracts(pre_mc_20p)
# 
# # screen the 40% mc I'm responsible for 
# screen_abstracts(pre_mc_40p)
# 
# #screen the shared 20% tfl file 
# screen_abstracts(pre_tfl_20p)
# 
# # screen the 40% mc I'm responsible for 
# screen_abstracts(pre_tfl_40p)


# 2.1.3) Import Result CSVs, Summarize, and Export as new CSV -------
# Each screened file is read and reduced to its summary columns with
# F.summ_screen(); the write.csv() exports are left commented out.
# NOTE(review): these paths start at "DerivedData/", whereas the prescreen
# files are read through dir.deriveddata ("./Code/DerivedData/") -- confirm
# which root is the intended one.

# metacoupling, shared 20%
scrn_mc_20p <- read.csv(file = "DerivedData/screen_titleabs/screened_mc_20p_nm.csv")
results_mc_20p_nm <- scrn_mc_20p %>% F.summ_screen()
#write.csv(results_mc_20p_nm, "DerivedData/screen_titleabs_summ/screened_summ_mc_20p_nm.csv")

# metacoupling, NM's 40%
scrn_mc_40p <- read.csv(file = "DerivedData/screen_titleabs/screened_mc_40p_nm.csv")
results_mc_40p_nm <- scrn_mc_40p %>% F.summ_screen()
#write.csv(results_mc_40p_nm, "DerivedData/screen_titleabs_summ/screened_summ_mc_40p_nm.csv")

# Tobler's first law, shared 20%
scrn_tfl_20p <- read.csv(file = "DerivedData/screen_titleabs/screened_tfl_20p_nm.csv")
results_tfl_20p_nm <- scrn_tfl_20p %>% F.summ_screen()
#write.csv(results_tfl_20p_nm, "DerivedData/screen_titleabs_summ/screened_summ_tfl_20p_nm.csv")

# Tobler's first law, NM's 40%
scrn_tfl_40p <- read.csv(file = "DerivedData/screen_titleabs/screened_tfl_40p_nm.csv")
results_tfl_40p_nm <- scrn_tfl_40p %>% F.summ_screen()
#write.csv(results_tfl_40p_nm, "DerivedData/screen_titleabs_summ/screened_summ_tfl_40p_nm.csv")


#    #    #

# 2.2) YL relevance screening ---------------------------------------------

# 2.2.1) Import CSV's ------------------------------------
# YL's pre-screening files; these overwrite the `pre_*` objects loaded for NM.

pre_mc_20p <- read.csv(file = paste0(dir.deriveddata, "prescreen/mc_20p_yl.csv"))
pre_mc_40p <- read.csv(file = paste0(dir.deriveddata, "prescreen/mc_40p_yl.csv"))

pre_tfl_20p <- read.csv(file = paste0(dir.deriveddata, "prescreen/tfl_20p_yl.csv"))
pre_tfl_40p <- read.csv(file = paste0(dir.deriveddata, "prescreen/tfl_40p_yl.csv"))

# 2.2.2) Screen Titles and Abstracts ----------------------

# # screen the shared 20% mc file
# screen_abstracts(pre_mc_20p)
# 
# # screen the 40% mc I'm responsible for 
# screen_abstracts(pre_mc_40p)
# 
# #screen the shared 20% tfl file 
# screen_abstracts(pre_tfl_20p)
# 
# # screen the 40% mc I'm responsible for 
# screen_abstracts(pre_tfl_40p)


# 2.2.3) Import Result CSVs, Summarize, Export as new CSV ----

# every YL result file, i.e. any file name ending in "_done_yl.csv"
csv_ls <- list.files(
  path = "./data/screen_titleabs",
  pattern = ".*_done_yl\\.csv$",
  full.names = TRUE
)
csv_ls

for (csv in csv_ls) {

  # read one screened file and keep only its summary columns
  scrn <- read.csv(csv)
  scrn_summ <- F.summ_screen(scrn)

  # build the output path from the input path: switch to the summary folder,
  # replace the 'pre_' prefix and strip the 'done_' marker
  filename <- gsub("screen_titleabs", "screen_titleabs_summ", csv)
  filename <- gsub('pre_', 'screened_summ_', filename)
  filename <- gsub('done_', '', filename)

  # export is disabled so the existing summary files are not overwritten
  #write.csv(scrn_summ, file = filename)

}


#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #

# PART 3: Compare the shared 20% results  ----------------------------------------

#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #

# 3.1) Load in CSV's --------------------------------
# Screening summaries of the shared 20% subset, per reviewer (nm / yl).
mc_20p_yl <- read_csv(file = "data/screen_titleabs_summ/screened_summ_mc_20p_yl.csv")
mc_20p_nm <- read_csv(file = "data/screen_titleabs_summ/screened_summ_mc_20p_nm.csv")

tfl_20p_yl <- read_csv(file = "data/screen_titleabs_summ/screened_summ_tfl_20p_yl.csv")
tfl_20p_nm <- read_csv(file = "data/screen_titleabs_summ/screened_summ_tfl_20p_nm.csv")


# 3.1.0) (omit) Rename the screenedabstracts column with author initials -----
# names(mc_20p_yl)[names(mc_20p_yl) == 'screened_abstracts'] <- 'screened_abstracts_yl'
# names(mc_20p_nm)[names(mc_20p_nm) == 'screened_abstracts'] <- 'screened_abstracts_nm'
# 
# names(tfl_20p_yl)[names(tfl_20p_yl) == 'screened_abstracts'] <- 'screened_abstracts_yl'
# names(tfl_20p_nm)[names(tfl_20p_nm) == 'screened_abstracts'] <- 'screened_abstracts_nm'




# 3.2) Compare Results ------------------------------

# Both comparisons join NM's sheet (x) to YL's sheet (y) on every column
# except the reviewer-specific ones, so the merged table carries
# `screened_abstracts.x` / `notes.x` (NM) and `screened_abstracts.y` /
# `notes.y` (YL). The key columns are passed by NAME: the previous logical
# mask was applied positionally to both tables and would silently join on the
# wrong columns if their column order ever differed.

# 3.2.1) compare mc results --------------------
mc_20p <- merge(
  x = mc_20p_nm,
  y = mc_20p_yl,
  by = setdiff(names(mc_20p_nm), c("screened_abstracts", "notes"))
) %>%
  # 1 = same decision, 0 = different, NA = at least one decision missing
  dplyr::mutate(same = ifelse(`screened_abstracts.x` == `screened_abstracts.y`, 1, 0))

# share of NM's entries on which both reviewers agree
mc_20p_alignment <- mc_20p %>%
  dplyr::summarise(alignment = sum(same, na.rm = TRUE) / nrow(mc_20p_nm))

mc_20p_alignment$alignment %>%
  scales::percent(accuracy = 0.01) %>%
  cat('Alignment between NM and YL:', ., 'for MC', "\n") ## 94.55%


# 3.2.2) compare tfl results --------------------
tfl_20p <- merge(
  x = tfl_20p_nm,
  y = tfl_20p_yl,
  by = setdiff(names(tfl_20p_nm), c("screened_abstracts", "notes"))
) %>%
  dplyr::mutate(same = ifelse(`screened_abstracts.x` == `screened_abstracts.y`, 1, 0))

tfl_20p_alignment <- tfl_20p %>%
  dplyr::summarise(alignment = sum(same, na.rm = TRUE) / nrow(tfl_20p_nm))

tfl_20p_alignment$alignment %>%
  scales::percent(accuracy = 0.01) %>%
  cat('Alignment between NM and YL:', ., 'for TFL', "\n") ## 94.44%


# 3.3) Keep only selected papers ------------------------------

# update TFL decisions after 5/11/2022 meeting
# (overrides the `.x` decision column for one entry)
tfl_20p_updated <- tfl_20p
tfl_20p_updated$screened_abstracts.x[
  tfl_20p_updated$unique_id %in% "WOS:000407512100001"
] <- "selected"

# update MC decisions after 5/11/2022 meeting
# (overrides the `.y` decision column for three entries)
mc_20p_updated <- mc_20p
mc_20p_updated$screened_abstracts.y[
  mc_20p_updated$unique_id %in% c(
    "WOS:000641612900001",
    "WOS:000332185100002",
    "WOS:000749000800004"
  )
] <- "selected"

# exports disabled so the reconciled files on disk are not overwritten
#write_csv(mc_20p_updated, "data/screen_titleabs_summ/screened_summ_mc_20p_both.csv")
#write_csv(tfl_20p_updated, "data/screen_titleabs_summ/screened_summ_tfl_20p_both.csv")


#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #

# PART 4: Merge and create all relevant entries dataset  --------------------------

#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #


# 4.1) Import screened CSV's ------------------------------------------

# shared 20%
mc_20p <- read_csv('data/screen_titleabs_summ/screened_summ_mc_20p_both.csv')
tfl_20p <- read_csv('data/screen_titleabs_summ/screened_summ_tfl_20p_both.csv')

# For each reviewer's 40% file, keep only the entries marked "selected".
# `%in%` is used instead of `==` because it never returns NA: an entry with a
# missing decision is simply dropped, whereas `== "selected"` would insert a
# row made entirely of NAs into the result.

# MC
mc_40p_nm <- read_csv('data/screen_titleabs_summ/screened_summ_mc_40p_nm.csv')
mc_40p_nm <- mc_40p_nm[mc_40p_nm$screened_abstracts %in% "selected", ]

mc_40p_yl <- read_csv('data/screen_titleabs_summ/screened_summ_mc_40p_yl.csv')
mc_40p_yl <- mc_40p_yl[mc_40p_yl$screened_abstracts %in% "selected", ]


# selected MC entries from both individual 40% subsets
sel_mc_80p <- rbind(mc_40p_nm, mc_40p_yl)


# TFL
tfl_40p_nm <- read_csv('data/screen_titleabs_summ/screened_summ_tfl_40p_nm.csv')
tfl_40p_nm <- tfl_40p_nm[tfl_40p_nm$screened_abstracts %in% "selected", ]


tfl_40p_yl <- read_csv('data/screen_titleabs_summ/screened_summ_tfl_40p_yl.csv')
tfl_40p_yl <- tfl_40p_yl[tfl_40p_yl$screened_abstracts %in% "selected", ]


# selected TFL entries from both individual 40% subsets
sel_tfl_80p <- rbind(tfl_40p_nm, tfl_40p_yl)


# 4.2) Create one df for MC and for TFL by selecting then merging --------

# keep only selected values
# (`%in%` drops entries with a missing decision instead of turning them into
# all-NA rows, which `== "selected"` does)
sel_mc_20p <- mc_20p[mc_20p$screened_abstracts.x %in% "selected", ]
sel_tfl_20p <- tfl_20p[tfl_20p$screened_abstracts.x %in% "selected", ]

# drop all columns that aren't the same from before
# The first 9 columns are kept by POSITION and then renamed to the 80% table's
# names so rbind() lines up.
# NOTE(review): assumes the 80% tables have exactly 9 columns in the same
# order as the first 9 columns of the merged 20% tables -- confirm.
sel_mc_20p <- sel_mc_20p[1:9]
names(sel_mc_20p) <- names(sel_mc_80p)

sel_tfl_20p <- sel_tfl_20p[1:9]
names(sel_tfl_20p) <- names(sel_tfl_80p)

# merge with the 80p from above
sel_mc_100p <- rbind(sel_mc_80p, sel_mc_20p)
sel_tfl_100p <- rbind(sel_tfl_80p, sel_tfl_20p)

# 4.2.1) Add back in the abstracts from the source bibs ----------

# source csv's, reduced to the join key and the abstract text
mc_source <- read_csv("Code/SourceData/wos_mc.csv") %>%
  select(c("unique_id", "abstract"))

tfl_source <- read_csv("Code/SourceData/wos_tfl.csv") %>%
  select(c("unique_id", "abstract"))

# attach the abstract to every selected paper (papers without a match get NA)
sel_mc_100p <- sel_mc_100p %>% left_join(mc_source, by = "unique_id")
sel_tfl_100p <- sel_tfl_100p %>% left_join(tfl_source, by = "unique_id")

# 4.3) Export final relevance CSV's ---------------------------------------
# exports disabled so the final files on disk are not overwritten
#write_csv(sel_mc_100p, "data/screen_titleabs_summ/mc_selected_full.csv")
#write_csv(sel_tfl_100p, "data/screen_titleabs_summ/tfl_selected_full.csv")



#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #

# PART 5: Summarize & plot subtopics ----------------------------------------------

#  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #  #

# 5.1) Load .bib files ----------------------------------------------------
dir.st <- "Code/SourceData/subtopics/"
dir.scr.st <- "Code/DerivedData/scr_subtopics/"

# Reader used for every subtopic bibliography: prefixes the file name with
# dir.st and parses it with revtools
read_st_bib <- function(bib_file) {
  read_bibliography(paste0(dir.st, bib_file))
}

# TFL: source CSV plus one bibliography per subtopic
tfl_raw <- read_csv("Code/SourceData/wos_tfl.csv")

tfl_landch <- read_st_bib("tfl_landchange.bib")
tfl_migr <- read_st_bib("tfl_migration.bib")
tfl_tour <- read_st_bib("tfl_tour.bib")
tfl_trade <- read_st_bib("tfl_trade.bib") # flagged by the author: needs a double-check
tfl_agdev <- read_st_bib("tfl_agdev.bib")
tfl_cons <- read_st_bib("tfl_conservation.bib")
tfl_gov <- read_st_bib("tfl_gov.bib")

# MC: source CSV plus one bibliography per subtopic
mc_raw <- read_csv("Code/SourceData/wos_mc.csv")

mc_landch <- read_st_bib("mc_landchange.bib")
mc_migr <- read_st_bib("mc_migration.bib")
mc_tour <- read_st_bib("mc_tour.bib")
mc_trade <- read_st_bib("mc_trade.bib")
mc_agdev <- read_st_bib("mc_agdev.bib")
mc_cons <- read_st_bib("mc_conservation.bib")
mc_gov <- read_st_bib("mc_gov.bib")


# 5.1.1) Add "topic" & "citations" columns to subtopics --------

# Read the selected-paper tables and label each with its topic
tfl_100p <- read_csv("data/screen_titleabs_summ/tfl_selected_full.csv")
tfl_100p$topic <- "TFL"

mc_100p <- read_csv("data/screen_titleabs_summ/mc_selected_full.csv")
mc_100p$topic <- "MC"


# Citation counts live in separate .bib exports: keep the key and the
# times_cited field, converted to integer
mc_raw_wcit <- read_bibliography("Code/SourceData/wos_mc_wcitations.bib") %>%
  select(unique_id, times_cited) %>%
  mutate(times_cited = as.integer(times_cited))

tfl_raw_wcit <- read_bibliography("Code/SourceData/wos_tfl_wcitations.bib") %>%
  select(unique_id, times_cited) %>%
  mutate(times_cited = as.integer(times_cited))


# Attach the citation counts to the _100p tables.
# NOTE(review): merge() with default arguments keeps only unique_ids present
# in both tables, so papers missing from the citation export are dropped --
# confirm this is intended.
tfl_100p <- merge(x = tfl_100p, y = tfl_raw_wcit, by = 'unique_id')
mc_100p <- merge(x = mc_100p, y = mc_raw_wcit, by = 'unique_id')


# 5.2) Create screened subtopics by joining ----------------------------

# For every subtopic, keep the selected papers whose unique_id also appears in
# that subtopic's bibliography, then label them in a new `st` column.
# mutate() is used for the label instead of `df$st <- "..."` because the
# latter errors on a data.frame with zero rows ("replacement has 1 row, data
# has 0"), which would stop the script whenever a subtopic has no matches.

# TFL
scr_tfl_landch <- tfl_100p %>%
  semi_join(tfl_landch, by = 'unique_id') %>%
  mutate(st = "Land Change")
scr_tfl_migr <- tfl_100p %>%
  semi_join(tfl_migr, by = 'unique_id') %>%
  mutate(st = "Species Migration")
scr_tfl_tour <- tfl_100p %>%
  semi_join(tfl_tour, by = 'unique_id') %>%
  mutate(st = "Tourism")
scr_tfl_trade <- tfl_100p %>%
  semi_join(tfl_trade, by = 'unique_id') %>%
  mutate(st = "Trade")
scr_tfl_agdev <- tfl_100p %>%
  semi_join(tfl_agdev, by = 'unique_id') %>%
  mutate(st = "Ag. Development")
scr_tfl_cons <- tfl_100p %>%
  semi_join(tfl_cons, by = 'unique_id') %>%
  mutate(st = "Conservation")
scr_tfl_gov <- tfl_100p %>%
  semi_join(tfl_gov, by = 'unique_id') %>%
  mutate(st = "Governance")

scr_tfl_all <- rbind(scr_tfl_landch, scr_tfl_migr, scr_tfl_tour, scr_tfl_trade,
                     scr_tfl_agdev, scr_tfl_cons, scr_tfl_gov)

# MC
scr_mc_landch <- mc_100p %>%
  semi_join(mc_landch, by = 'unique_id') %>%
  mutate(st = "Land Change")
scr_mc_migr <- mc_100p %>%
  semi_join(mc_migr, by = 'unique_id') %>%
  mutate(st = "Species Migration")
scr_mc_tour <- mc_100p %>%
  semi_join(mc_tour, by = 'unique_id') %>%
  mutate(st = "Tourism")
scr_mc_trade <- mc_100p %>%
  semi_join(mc_trade, by = 'unique_id') %>%
  mutate(st = "Trade")
scr_mc_agdev <- mc_100p %>%
  semi_join(mc_agdev, by = 'unique_id') %>%
  mutate(st = "Ag. Development")
scr_mc_cons <- mc_100p %>%
  semi_join(mc_cons, by = 'unique_id') %>%
  mutate(st = "Conservation")
scr_mc_gov <- mc_100p %>%
  semi_join(mc_gov, by = 'unique_id') %>%
  mutate(st = "Governance")

scr_mc_all <- rbind(scr_mc_landch, scr_mc_migr, scr_mc_tour, scr_mc_trade,
                    scr_mc_agdev, scr_mc_cons, scr_mc_gov)

# merge all the results into one df (a paper matched to several subtopics
# appears once per subtopic)
scr_mctfl_all <- rbind(scr_mc_all, scr_tfl_all)



# 5.3) Count Results & Plot ---------------------------------------

# Denominators used for the percentage column.
# NOTE(review): 605 and 39 are hard-coded; presumably the MC and TFL paper
# totals -- confirm they still match the data, or derive them from it.
n_total_mc <- 605
n_total_tfl <- 39

# Count results per group & calculate percentages.
# `.groups = "drop"` returns an ungrouped table, so no grouping silently
# carries over into later steps (and summarise() emits no grouping message).
st_count <- scr_mctfl_all %>%
  group_by(topic, st) %>%
  summarise(count_st = n(), .groups = "drop") %>%
  mutate(perc = if_else(topic == "MC",
                        (count_st / n_total_mc) * 100,
                        (count_st / n_total_tfl) * 100))

# manually label x axis, not ideal
#x_labs <- c("Ag. Development", "Conservation", "Governance", "Land Change", "Sp. Migration", "Tourism", "Trade")

# 5.3.1) Plot Count & Percentage --------------------------

# Manually re-order x-axis levels (this is the left-to-right bar order)
st_count$st <- factor(st_count$st,
                      levels = c("Conservation", "Land Change",
                                 "Ag. Development", "Governance",  "Trade",
                                 "Tourism", "Species Migration"))

# Bar chart of the number of papers per subtopic, MC and TFL bars side by
# side. The plot is assembled step by step and printed at the end.
plot_count <- ggplot(data = st_count,
                     mapping = aes(x = st, y = count_st, fill = topic))

# bars take their height straight from count_st
plot_count <- plot_count +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal()

# (disabled) manual x labels: scale_x_discrete(labels = x_labs)
plot_count <- plot_count +
  labs(
    x = "Sustainability Subtopic",
    y = "Number of Papers",
    title = "Number of Publications Across Sustainability",
    subtitle = "Metacoupling (MC) and Tobler's First Law (TFL)"
  )

# legend at the top right, pulled up toward the title block; text sizes
plot_count <- plot_count +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    legend.position = "top",
    legend.justification = "right",
    legend.margin = margin(t = -25),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12, face = "italic"),
    # (disabled) axis.title.x = element_blank(),
    plot.title = element_text(size = 15),
    plot.subtitle = element_text(size = 13)
  )

plot_count

# Bar chart of the percentage column (perc) per subtopic, MC and TFL bars side
# by side. The plot is assembled step by step and printed at the end.
plot_perc <- ggplot(data = st_count,
                    mapping = aes(x = st, y = perc, fill = topic))

# bars take their height straight from perc
plot_perc <- plot_perc +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal()

# (disabled) manual x labels: scale_x_discrete(labels = x_labs)
plot_perc <- plot_perc +
  labs(
    x = "Sustainability Subtopic",
    y = "Percent of Total Papers",
    title = "Percent of Total Papers per Topic",
    subtitle = "Metacoupling (MC) and Tobler's First Law (TFL)"
  )

# legend at the top right, pulled up toward the title block; text sizes
plot_perc <- plot_perc +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    legend.position = "top",
    legend.justification = "right",
    legend.margin = margin(t = -25),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12, face = "italic"),
    # (disabled) axis.title.x = element_blank(),
    plot.title = element_text(size = 15),
    plot.subtitle = element_text(size = 13)
  )

plot_perc



# 5.4) Exporting & Saving CSV's, XLSX's, & Plots --------------------


# 5.4.1) Export Summarized & Screened CSV's --------------------------

# # TFL
# write_csv(scr_tfl_landch, paste0(dir.scr.st, "scr_tfl_landch.csv"))
# write_csv(scr_tfl_migr, paste0(dir.scr.st, "scr_tfl_migr.csv"))
# write_csv(scr_tfl_tour, paste0(dir.scr.st, "scr_tfl_tour.csv"))
# write_csv(scr_tfl_trade, paste0(dir.scr.st, "scr_tfl_trade.csv"))
# write_csv(scr_tfl_agdev, paste0(dir.scr.st, "scr_tfl_agdev.csv"))
# write_csv(scr_tfl_cons, paste0(dir.scr.st, "scr_tfl_cons.csv"))
# write_csv(scr_tfl_gov, paste0(dir.scr.st, "scr_tfl_gov.csv"))
# 
# # MC
# write_csv(scr_mc_landch, paste0(dir.scr.st, "scr_mc_landch.csv"))
# write_csv(scr_mc_migr, paste0(dir.scr.st, "scr_mc_migr.csv"))
# write_csv(scr_mc_tour, paste0(dir.scr.st, "scr_mc_tour.csv"))
# write_csv(scr_mc_trade, paste0(dir.scr.st, "scr_mc_trade.csv"))
# write_csv(scr_mc_agdev, paste0(dir.scr.st, "scr_mc_agdev.csv"))
# write_csv(scr_mc_cons, paste0(dir.scr.st, "scr_mc_cons.csv"))
# write_csv(scr_mc_gov, paste0(dir.scr.st, "scr_mc_gov.csv"))


# 5.4.2) Export Summarized & Screened Entries ---------------------- 

## NOTE: NM uploaded the XLSX and CSV's to the Drive, at 
## the path code/DerivedData/scr_subtopic

# Excluded entries = source rows whose unique_id is not in the selected set
mc_excluded <- mc_raw %>% anti_join(mc_100p, by = 'unique_id')
tfl_excluded <- tfl_raw %>% anti_join(tfl_100p, by = 'unique_id')

# write_csv(mc_excluded, paste0(dir.scr.st, "mc_excluded.csv"))
# write_csv(tfl_excluded, paste0(dir.scr.st, "tfl_excluded.csv"))

# Named lists for the Excel export: one list element per subtopic, the
# element name being the sheet label. The TFL trade sheet is left out here
# (it was commented out in the list).
xl_tfl <- list(
  tfl_lc = scr_tfl_landch,
  tfl_migr = scr_tfl_migr,
  tfl_tour = scr_tfl_tour,
  # tfl_trade = scr_tfl_trade,
  tfl_agdev = scr_tfl_agdev,
  tfl_cons = scr_tfl_cons,
  tfl_gov = scr_tfl_gov
)
#write_xlsx(xl_tfl, paste0(dir.scr.st, "scr_tfl.xlsx"))

xl_mc <- list(
  mc_lc = scr_mc_landch,
  mc_migr = scr_mc_migr,
  mc_tour = scr_mc_tour,
  mc_trade = scr_mc_trade,
  mc_agdev = scr_mc_agdev,
  mc_cons = scr_mc_cons,
  mc_gov = scr_mc_gov
)
#write_xlsx(xl_mc, paste0(dir.scr.st, "scr_mc.xlsx"))

# 5.4.3) Save Barplots --------------
# Write both bar charts as 300 dpi PNGs; no width/height is given, so ggsave
# falls back to its default size
ggsave(filename = "data/figures/mctfl_pubcount.png", plot = plot_count, dpi = 300)
ggsave(filename = "data/figures/mctfl_pubperc.png", plot = plot_perc, dpi = 300)



# Title: screening_titleabs_nm
# Author: Nick Manning
# Date: 4/23/2022
# Purpose: Import split search results and screen their abstracts and titles using revtools

# Requires:
## prescreened CSV's split to each of our assigned papers (from sample_manually.R)

# Results in: 
## initial screening CSV's (in /screen_titleabs_summ) split between people used
## in final_screened_results.R and cmpare_results_20p.R

# Steps:
# 1) Load Libraries
# 2) Import CSV's
# 3) Screen Titles and Abstracts, Summarize, and Export as CSV

###################################

# NOTES:

## The CSV result is saved externally from the script

## You name the result CSV from within the GUI after screening, I used the format "screened_mc_20p_nm.csv"

## Our result CSV has two new columns, "screened_abstracts" (either selected or excluded) and "notes"


# 1) Load Libraries & Functions -----------------------------------------------

rm(list=ls())
library(revtools)
library(dplyr) #for the 'select' fxn

# Keep only the summary columns of a screened result set.
#
# screened_set: data frame read from a screening result CSV; it must contain
#   the columns author, title, year, journal, doi, unique_id,
#   screened_abstracts and notes (select() errors if one is missing).
# Returns: the same rows with just those eight columns, in that order.
#
# select() is called as dplyr::select so that another attached package
# exporting its own `select` cannot mask it.
summ_screen <- function(screened_set){
  dplyr::select(screened_set, 'author', 'title', 'year', 'journal',
                'doi', 'unique_id', 'screened_abstracts', 'notes')
}

# 2) Import CSV's -------------------------------------------------------------

# show the working directory; the paths below are relative to it
getwd()

# prescreen CSVs assigned to NM: the shared 20% and NM's own 40%, per topic
pre_mc_20p <- read.csv(file = "Code/DerivedData/prescreen/mc_20p_nm.csv")
pre_mc_40p <- read.csv(file = "Code/DerivedData/prescreen/mc_40p_nm.csv")

pre_tfl_20p <- read.csv(file = "Code/DerivedData/prescreen/tfl_20p_nm.csv")
pre_tfl_40p <- read.csv(file = "Code/DerivedData/prescreen/tfl_40p_nm.csv")

# 3) Screen Titles and Abstracts & Save to screen_titleabs ---------------------

# Each call below starts one screening session on the given prescreen table.
# Per the NOTES at the top of this script, the result CSV is named and saved
# from within the GUI, not by this code.

# screen the shared 20% mc file
screen_abstracts(pre_mc_20p)

# screen the 40% mc I'm responsible for 
screen_abstracts(pre_mc_40p)

# screen the shared 20% tfl file 
screen_abstracts(pre_tfl_20p)

# screen the 40% tfl I'm responsible for 
screen_abstracts(pre_tfl_40p)


# 4) Import Result CSVs, Summarize, and Export as new CSV --------------------

# Same three steps for each screened file: read the result CSV saved from the
# GUI, reduce it to the summary columns with summ_screen(), write the summary.

# metacoupling, shared 20%
scrn_mc_20p <- read.csv(file = "Code/DerivedData/screen_titleabs/screened_mc_20p_nm.csv")
results_mc_20p_nm <- scrn_mc_20p %>% summ_screen()
write.csv(x = results_mc_20p_nm,
          file = "Code/DerivedData/screen_titleabs_summ/screened_summ_mc_20p_nm.csv")

# metacoupling, NM's 40%
scrn_mc_40p <- read.csv(file = "Code/DerivedData/screen_titleabs/screened_mc_40p_nm.csv")
results_mc_40p_nm <- scrn_mc_40p %>% summ_screen()
write.csv(x = results_mc_40p_nm,
          file = "Code/DerivedData/screen_titleabs_summ/screened_summ_mc_40p_nm.csv")

# Tobler's first law, shared 20%
scrn_tfl_20p <- read.csv(file = "Code/DerivedData/screen_titleabs/screened_tfl_20p_nm.csv")
results_tfl_20p_nm <- scrn_tfl_20p %>% summ_screen()
write.csv(x = results_tfl_20p_nm,
          file = "Code/DerivedData/screen_titleabs_summ/screened_summ_tfl_20p_nm.csv")

# Tobler's first law, NM's 40%
scrn_tfl_40p <- read.csv(file = "Code/DerivedData/screen_titleabs/screened_tfl_40p_nm.csv")
results_tfl_40p_nm <- scrn_tfl_40p %>% summ_screen()
write.csv(x = results_tfl_40p_nm,
          file = "Code/DerivedData/screen_titleabs_summ/screened_summ_tfl_40p_nm.csv")


################################

### TO-DO: ### 

# RUN A FULL TOPIC MODEL

#screen_topics()


# Title: screening_titleabs_nm
# Author: Nick Manning
# Date: 4/23/2022
# Purpose: Import split search results and screen their abstracts and titles using revtools

# Steps:
# 1) Load Libraries
# 2) Import CSV's
# 3) Screen Titles and Abstracts, Summarize, and Export as CSV

################################### #

# NOTES:

## The CSV result is saved externally from the script

## You name the result CSV from within the GUI after screening, I used the format "screened_mc_20p_nm.csv"

## Our result CSV has two new columns, "screened_abstracts" (either selected or excluded) and "notes"


# 1) Load Libraries & Functions -----------------------------------------------

rm(list=ls())
library(revtools)
library(dplyr) #for the 'select' fxn

# Reduce a screened result set to its summary columns.
#
# screened_set: data frame holding (at least) the columns author, title,
#   year, journal, doi, unique_id, screened_abstracts and notes.
# Returns: screened_set restricted to those eight columns, in that order.
summ_screen <- function(screened_set){
  dplyr::select(screened_set,
                'author', 'title', 'year', 'journal',
                'doi', 'unique_id', 'screened_abstracts', 'notes')
}

# 2) Import CSV's -------------------------------------------------------------

# show the working directory; dir.data below is relative to it
getwd()
# NOTE(review): this path uses lower-case "code" while other scripts use
# "Code" -- this matters on case-sensitive file systems; confirm.
dir.data <- "./code/DerivedData/"


# prescreen CSVs assigned to YL: the shared 20% and YL's own 40%, per topic
pre_mc_20p <- read.csv(file = paste0(dir.data, "prescreen/mc_20p_yl.csv"))
pre_mc_40p <- read.csv(file = paste0(dir.data, "prescreen/mc_40p_yl.csv"))

pre_tfl_20p <- read.csv(file = paste0(dir.data, "prescreen/tfl_20p_yl.csv"))
pre_tfl_40p <- read.csv(file = paste0(dir.data, "prescreen/tfl_40p_yl.csv"))

# 3) Screen Titles and Abstracts ---------------------------------------------

# Each call below starts one screening session on the given prescreen table.
# Per the NOTES at the top of this script, the result CSV is named and saved
# from within the GUI, not by this code.

# screen the shared 20% mc file
screen_abstracts(pre_mc_20p)

# screen the 40% mc I'm responsible for 
screen_abstracts(pre_mc_40p)

# screen the shared 20% tfl file 
screen_abstracts(pre_tfl_20p)

# screen the 40% tfl I'm responsible for 
screen_abstracts(pre_tfl_40p)


# 4) Import Result CSVs, Summarize, and Export as new CSV --------------------

# Collect every screened result file ending in "_done_yl.csv".
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
csv_ls <- list.files(path = "./data/screen_titleabs",
                     pattern = ".*_done_yl\\.csv$",
                     full.names = TRUE)
csv_ls

# An empty list would make the loop below a silent no-op, so say so.
if (length(csv_ls) == 0) {
  warning("No '*_done_yl.csv' files found in ./data/screen_titleabs; ",
          "nothing was summarized.", call. = FALSE)
}

for (csv in csv_ls) {

  # read one screened file and reduce it to the summary columns
  scrn <- read.csv(csv)
  scrn_summ <- summ_screen(scrn)

  # derive the output path from the input path: switch to the summary
  # folder, turn the 'pre_' prefix into 'screened_summ_', drop 'done_'
  filename <- gsub("screen_titleabs", "screen_titleabs_summ", csv) %>%
    gsub('pre_', 'screened_summ_', .) %>%
    gsub('done_', '', .)

  # NOTE(review): write.csv keeps its default row names, which adds an
  # unnamed index column; code reading these files back appears to select
  # columns by position, so do not change this without checking.
  write.csv(scrn_summ, file = filename)

}


################################ #

### TO-DO: ### 

# RUN A FULL TOPIC MODEL

#screen_topics()


# Title: screen_subtopics
# Author: Nick Manning & Yingjie Li
# Date: 5/23/2022
# Purpose: Read in the .bib files for each subtopic and join them 
# with the screened mc and tfl files to essentially screen the subtopics

# Requires:
## raw CSV's of the bibs (from sample_manually.R)
## raw .bib files (from WoS)
## fully screened and merged CSV's (from final_screened_results.R)

# Results in: 
## subtopic CSV's and XLSX's
## figures for count and percent
## excluded entry CSV's
## 100p selected tfl and mc CSV's (in Code/DerivedData)

#############################################################################

# Load Libraries -------------------
rm(list =ls())

library(revtools) # import bib
library(readr) # read/write csv's
library(dplyr) # inner join
library(writexl) # export final Excel files
library(ggplot2) 

# Load .bib files of subtopics ------------------

# show the working directory; the paths below are relative to it
getwd()
dir <- "Code/SourceData/subtopics/"
dir2 <- "Code/DerivedData/scr_subtopics/"

# Reader used for every subtopic bibliography: prefixes the file name with
# `dir` and parses it with revtools
read_st_bib <- function(bib_file) {
  read_bibliography(paste0(dir, bib_file))
}

# TFL: source CSV plus one bibliography per subtopic
tfl_raw <- read_csv("Code/SourceData/wos_tfl.csv")

tfl_landch <- read_st_bib("tfl_landchange.bib")
tfl_migr <- read_st_bib("tfl_migration.bib")
tfl_tour <- read_st_bib("tfl_tour.bib")
tfl_trade <- read_st_bib("tfl_trade.bib") # flagged by the author: needs a double-check
tfl_agdev <- read_st_bib("tfl_agdev.bib")
tfl_cons <- read_st_bib("tfl_conservation.bib")
tfl_gov <- read_st_bib("tfl_gov.bib")

# MC: source CSV plus one bibliography per subtopic
mc_raw <- read_csv("Code/SourceData/wos_mc.csv")

mc_landch <- read_st_bib("mc_landchange.bib")
mc_migr <- read_st_bib("mc_migration.bib")
mc_tour <- read_st_bib("mc_tour.bib")
mc_trade <- read_st_bib("mc_trade.bib")
mc_agdev <- read_st_bib("mc_agdev.bib")
mc_cons <- read_st_bib("mc_conservation.bib")
mc_gov <- read_st_bib("mc_gov.bib")


# Read the selected-paper tables and label each with its topic
tfl_100p <- read_csv("data/screen_titleabs_summ/tfl_selected_full.csv")
tfl_100p$topic <- "TFL"

mc_100p <- read_csv("data/screen_titleabs_summ/mc_selected_full.csv")
mc_100p$topic <- "MC"


# Citation counts live in separate .bib exports: keep the key and the
# times_cited field, converted to integer
mc_raw_wcit <- read_bibliography("Code/SourceData/wos_mc_wcitations.bib") %>%
  select(unique_id, times_cited) %>%
  mutate(times_cited = as.integer(times_cited))

tfl_raw_wcit <- read_bibliography("Code/SourceData/wos_tfl_wcitations.bib") %>%
  select(unique_id, times_cited) %>%
  mutate(times_cited = as.integer(times_cited))


# Attach the citation counts to the _100p tables.
# NOTE(review): merge() with default arguments keeps only unique_ids present
# in both tables, so papers missing from the citation export are dropped --
# confirm this is intended.
tfl_100p <- merge(x = tfl_100p, y = tfl_raw_wcit, by = 'unique_id')
mc_100p <- merge(x = mc_100p, y = mc_raw_wcit, by = 'unique_id')

# Export selected entries w citations (currently disabled)

#write_csv(tfl_100p, "Code/DerivedData/relevant_tfl_100p.csv")
#write_csv(mc_100p, "Code/DerivedData/relevant_mc_100p.csv")

# Create screened subtopics by joining ----------------------------

# For every subtopic, keep the selected papers whose unique_id also appears in
# that subtopic's bibliography, then label them in a new `st` column.
# mutate() is used for the label instead of `df$st <- "..."` because the
# latter errors on a data.frame with zero rows ("replacement has 1 row, data
# has 0"), which would stop the script whenever a subtopic has no matches.

# TFL
scr_tfl_landch <- tfl_100p %>%
  semi_join(tfl_landch, by = 'unique_id') %>%
  mutate(st = "Land Change")
scr_tfl_migr <- tfl_100p %>%
  semi_join(tfl_migr, by = 'unique_id') %>%
  mutate(st = "Species Migration")
scr_tfl_tour <- tfl_100p %>%
  semi_join(tfl_tour, by = 'unique_id') %>%
  mutate(st = "Tourism")
scr_tfl_trade <- tfl_100p %>%
  semi_join(tfl_trade, by = 'unique_id') %>%
  mutate(st = "Trade")
scr_tfl_agdev <- tfl_100p %>%
  semi_join(tfl_agdev, by = 'unique_id') %>%
  mutate(st = "Ag. Development")
scr_tfl_cons <- tfl_100p %>%
  semi_join(tfl_cons, by = 'unique_id') %>%
  mutate(st = "Conservation")
scr_tfl_gov <- tfl_100p %>%
  semi_join(tfl_gov, by = 'unique_id') %>%
  mutate(st = "Governance")

scr_tfl_all <- rbind(scr_tfl_landch, scr_tfl_migr, scr_tfl_tour, scr_tfl_trade,
                     scr_tfl_agdev, scr_tfl_cons, scr_tfl_gov)

# MC
scr_mc_landch <- mc_100p %>%
  semi_join(mc_landch, by = 'unique_id') %>%
  mutate(st = "Land Change")
scr_mc_migr <- mc_100p %>%
  semi_join(mc_migr, by = 'unique_id') %>%
  mutate(st = "Species Migration")
scr_mc_tour <- mc_100p %>%
  semi_join(mc_tour, by = 'unique_id') %>%
  mutate(st = "Tourism")
scr_mc_trade <- mc_100p %>%
  semi_join(mc_trade, by = 'unique_id') %>%
  mutate(st = "Trade")
scr_mc_agdev <- mc_100p %>%
  semi_join(mc_agdev, by = 'unique_id') %>%
  mutate(st = "Ag. Development")
scr_mc_cons <- mc_100p %>%
  semi_join(mc_cons, by = 'unique_id') %>%
  mutate(st = "Conservation")
scr_mc_gov <- mc_100p %>%
  semi_join(mc_gov, by = 'unique_id') %>%
  mutate(st = "Governance")

scr_mc_all <- rbind(scr_mc_landch, scr_mc_migr, scr_mc_tour, scr_mc_trade,
                    scr_mc_agdev, scr_mc_cons, scr_mc_gov)

# merge all the results into one df (a paper matched to several subtopics
# appears once per subtopic)
scr_mctfl_all <- rbind(scr_mc_all, scr_tfl_all)

# add subtopic column to relevant papers
# The join key is stated explicitly instead of relying on dplyr's implicit
# "all shared column names" key, so the join cannot silently change if the
# two tables ever come to share another column.
# Papers without a subtopic keep st = NA; a paper matched to several
# subtopics appears once per subtopic.
mc_100p_st <- left_join(mc_100p, scr_mc_all[, c("unique_id", "st")],
                        by = "unique_id")
tfl_100p_st <- left_join(tfl_100p, scr_tfl_all[, c("unique_id", "st")],
                         by = "unique_id")

# Export selected entries w subtopics 
write_csv(tfl_100p_st, "Code/DerivedData/relevant_tfl_100p_st.csv")
write_csv(mc_100p_st, "Code/DerivedData/relevant_mc_100p_st.csv")



# Count Results & Plot --------------------------

# Denominators used for the percentage column.
# NOTE(review): 605 and 39 are hard-coded; presumably the MC and TFL paper
# totals -- confirm they still match the data, or derive them from it.
n_total_mc <- 605
n_total_tfl <- 39

# Count results per group & calculate percentages.
# `.groups = "drop"` returns an ungrouped table, so no grouping silently
# carries over into later steps (and summarise() emits no grouping message).
st_count <- scr_mctfl_all %>%
  group_by(topic, st) %>%
  summarise(count_st = n(), .groups = "drop") %>%
  mutate(perc = if_else(topic == "MC",
                        (count_st / n_total_mc) * 100,
                        (count_st / n_total_tfl) * 100))

# manually label x axis, not ideal
#x_labs <- c("Ag. Development", "Conservation", "Governance", "Land Change", "Sp. Migration", "Tourism", "Trade")

#### Plot Count & Percentage ###

# Manually re-order x-axis levels (this is the left-to-right bar order)
st_count$st <- factor(st_count$st,
                      levels = c("Conservation", "Land Change",
                                 "Ag. Development", "Governance",  "Trade",
                                 "Tourism", "Species Migration"))

# Bar chart of the number of papers per subtopic, MC and TFL bars side by
# side. The plot is assembled step by step and printed at the end.
plot_count <- ggplot(data = st_count,
                     mapping = aes(x = st, y = count_st, fill = topic))

# bars take their height straight from count_st
plot_count <- plot_count +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal()

# (disabled) manual x labels: scale_x_discrete(labels = x_labs)
plot_count <- plot_count +
  labs(
    x = "Sustainability Subtopic",
    y = "Number of Papers",
    title = "Number of Publications Across Sustainability",
    subtitle = "Metacoupling (MC) and Tobler's First Law (TFL)"
  )

# legend at the top right, pulled up toward the title block; text sizes
plot_count <- plot_count +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    legend.position = "top",
    legend.justification = "right",
    legend.margin = margin(t = -25),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12, face = "italic"),
    # (disabled) axis.title.x = element_blank(),
    plot.title = element_text(size = 15),
    plot.subtitle = element_text(size = 13)
  )

plot_count

# Bar chart of the percentage column (perc) per subtopic, MC and TFL bars side
# by side. The plot is assembled step by step and printed at the end.
plot_perc <- ggplot(data = st_count,
                    mapping = aes(x = st, y = perc, fill = topic))

# bars take their height straight from perc
plot_perc <- plot_perc +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal()

# (disabled) manual x labels: scale_x_discrete(labels = x_labs)
plot_perc <- plot_perc +
  labs(
    x = "Sustainability Subtopic",
    y = "Percent of Total Papers",
    title = "Percent of Total Papers per Topic",
    subtitle = "Metacoupling (MC) and Tobler's First Law (TFL)"
  )

# legend at the top right, pulled up toward the title block; text sizes
plot_perc <- plot_perc +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    legend.position = "top",
    legend.justification = "right",
    legend.margin = margin(t = -25),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12, face = "italic"),
    # (disabled) axis.title.x = element_blank(),
    plot.title = element_text(size = 15),
    plot.subtitle = element_text(size = 13)
  )

plot_perc



# Export Summarized & Screened CSV's --------------------------

# Each screened subtopic table is written to dir2 under the file name given
# as its list name.

# TFL
tfl_exports <- list(
  "scr_tfl_landch.csv" = scr_tfl_landch,
  "scr_tfl_migr.csv" = scr_tfl_migr,
  "scr_tfl_tour.csv" = scr_tfl_tour,
  "scr_tfl_trade.csv" = scr_tfl_trade,
  "scr_tfl_agdev.csv" = scr_tfl_agdev,
  "scr_tfl_cons.csv" = scr_tfl_cons,
  "scr_tfl_gov.csv" = scr_tfl_gov
)
for (csv_name in names(tfl_exports)) {
  write_csv(tfl_exports[[csv_name]], paste0(dir2, csv_name))
}

# MC
mc_exports <- list(
  "scr_mc_landch.csv" = scr_mc_landch,
  "scr_mc_migr.csv" = scr_mc_migr,
  "scr_mc_tour.csv" = scr_mc_tour,
  "scr_mc_trade.csv" = scr_mc_trade,
  "scr_mc_agdev.csv" = scr_mc_agdev,
  "scr_mc_cons.csv" = scr_mc_cons,
  "scr_mc_gov.csv" = scr_mc_gov
)
for (csv_name in names(mc_exports)) {
  write_csv(mc_exports[[csv_name]], paste0(dir2, csv_name))
}


# Export Summarized & Screened Entries ---------------------- 


## NOTE: NM uploaded the XLSX and CSV's to the Drive, at 
## the path code/DerivedData/scr_subtopic


# Excluded entries = source rows whose unique_id is not in the selected set
mc_excluded <- mc_raw %>% anti_join(mc_100p, by = 'unique_id')
tfl_excluded <- tfl_raw %>% anti_join(tfl_100p, by = 'unique_id')

write_csv(mc_excluded, paste0(dir2, "mc_excluded.csv"))
write_csv(tfl_excluded, paste0(dir2, "tfl_excluded.csv"))

# Excel workbooks with one sheet per subtopic: each list element becomes a
# sheet, the element name being the sheet label. The TFL trade sheet is left
# out here (it was commented out in the list).
xl_tfl <- list(
  tfl_lc = scr_tfl_landch,
  tfl_migr = scr_tfl_migr,
  tfl_tour = scr_tfl_tour,
  # tfl_trade = scr_tfl_trade,
  tfl_agdev = scr_tfl_agdev,
  tfl_cons = scr_tfl_cons,
  tfl_gov = scr_tfl_gov
)
write_xlsx(xl_tfl, paste0(dir2, "scr_tfl.xlsx"))

xl_mc <- list(
  mc_lc = scr_mc_landch,
  mc_migr = scr_mc_migr,
  mc_tour = scr_mc_tour,
  mc_trade = scr_mc_trade,
  mc_agdev = scr_mc_agdev,
  mc_cons = scr_mc_cons,
  mc_gov = scr_mc_gov
)
write_xlsx(xl_mc, paste0(dir2, "scr_mc.xlsx"))

# Save Barplots --------------
# Write both bar charts as 300 dpi PNGs; no width/height is given, so ggsave
# falls back to its default size
ggsave(filename = "data/figures/mctfl_pubcount.png", plot = plot_count, dpi = 300)
ggsave(filename = "data/figures/mctfl_pubperc.png", plot = plot_perc, dpi = 300)





#############################################################################
############# OLD, use testTFL_screen_testing_laws.R now ####################
#############################################################################



# Title: screen_testing_laws.R  
# Author: Nick Manning & Yingjie Li
# Date: 6/13/2022
# Purpose: Read in the relevant articles CSV and separate it into 
# the _20p_shared files for YL. Screens the relevant entries to test
# whether MC and TFL hold.

# Requires:
## fully screened and merged CSV's (from final_screened_results.R)

# Results in: 
## Applied MC / TFL CSVs (in screen_apply)
## 20% YL CSV 

######################## START ##########################################

# 0) Load Libraries and Data ----------------------------------------------------

# 0A) Load Libraries -------------------------------------------
rm(list =ls())

library(revtools) # import bib
library(readr) # read/write csv's
library(dplyr) # inner join
library(ggplot2)
library(stringr) # for splitting the column on if the laws hold or not 
library(tidyr) # for replace_na


# 0B) Load Data from Previous Scripts ----------------------------

# Relevant (selected) entries for each topic.
# NOTE(review): MC is read with base read.csv and TFL with readr read_csv, so
# the two tables can differ in class and in how the index column is named --
# the index-column rename further down appears to compensate for this.
relevant_mc_100p <- read.csv(file = "Code/DerivedData/relevant_mc_100p.csv")
relevant_tfl_100p <- read_csv(file = "Code/DerivedData/relevant_tfl_100p.csv")


# 1) Prep Data for Testing if MC and TFL Hold ------------------------------------ 

# 1A) Split Data Into NM & YL Sections  ---------------

# number of rows that make up 20% of each table (rounded)
n_mc_20p <- round(0.2 * nrow(relevant_mc_100p))
n_tfl_20p <- round(0.2 * nrow(relevant_tfl_100p))


# draw that many rows at random for YL and store them as plain data frames
# NOTE(review): no set.seed() call is visible in this script, so the sample
# presumably differs on every run -- confirm whether that is acceptable.
relevant_mc_20p <- relevant_mc_100p %>%
  sample_n(n_mc_20p) %>%
  as.data.frame()
relevant_tfl_20p <- relevant_tfl_100p %>%
  sample_n(n_tfl_20p) %>%
  as.data.frame()


# 1B) Change Data to Fit revtools Requirements ----------------------------------

# Rebuild the screening input from the source CSVs: keep the source rows
# (all source columns) whose unique_id is among the relevant entries, once
# for the full relevant set and once for the 20% sample.
raw_mc <- read.csv(file = "Code/SourceData/wos_mc.csv")
full_relv_mc_100p <- semi_join(x = raw_mc, y = relevant_mc_100p, by = "unique_id")
full_relv_mc_20p <- semi_join(x = raw_mc, y = relevant_mc_20p, by = "unique_id")

raw_tfl <- read.csv(file = "Code/SourceData/wos_tfl.csv")
full_relv_tfl_100p <- semi_join(x = raw_tfl, y = relevant_tfl_100p, by = "unique_id")
full_relv_tfl_20p <- semi_join(x = raw_tfl, y = relevant_tfl_20p, by = "unique_id")




# 2) Do the Screening for Testing if Laws Apply (NM 100%, YL 20%) ---------------------

# NOTE: select/exclude/unknown used for TFL. Notes section used for MC (1) and TFL (2)

# NOTE: have to do this in notes section, not select/exclude/unknown section to get two choices

# NOTE: Saved to Code/DerivedData/testing_laws

# NOTE: Testing format: XY where X = MC and Y = TFL, 1st spot = MC, 2nd = TFL

## if MC holds, MC = Y, if fails, MC = N, review/other = R, unsure = U
## if TFL hold, TFL = Y, if fails, TFL = N, review/other = R, unsure = U


# EXAMPLE: In this paper, MC holds, TFL doesn't hold, entry = YN
# EX 2: It's a review paper = RR
# EX 3: MC and TFL both hold = YY

# For NM

#screen_abstracts(full_relv_mc_100p) #done round 1! 6/17/2022, still some U's & X's to revisit
#screen_abstracts()


#screen_abstracts(full_relv_tfl_100p)
#screen_abstracts()

# For YL -- Only uncomment and run the screen_abstracts(var) for initialization, then 
# run screen_abstracts() with the empty parentheses

#screen_abstracts(full_relv_mc_20p)
#screen_abstracts()

#screen_abstracts(full_relv_tfl_20p)
#screen_abstracts()



# 3) Import screened data & merge with relevant entries ---------------------------

# 3A) Import newly screened data ------------------

# import screened data (paths are relative to the working directory)
testlaws_mc <- read.csv("scr_testlaws_mc100p_nm_done_edits.csv")

testlaws_tfl <- read.csv("scr_testlaws_tfl100p_nm_done_edits.csv")

# the law-testing codes were typed into the "notes" column; rename it to
# "applies" so we can join to screened CSV
names(testlaws_mc)[names(testlaws_mc) == "notes"] <- "applies"
names(testlaws_tfl)[names(testlaws_tfl) == "notes"] <- "applies"


### Remerge into subtopics ### 


# Attach the "applies" code to every relevant entry. All relevant rows are
# kept; entries without a code get applies = NA.
# The join key is stated explicitly instead of relying on dplyr's implicit
# "all shared column names" key.
testlaws_mc <- left_join(relevant_mc_100p,
                         testlaws_mc[, c("unique_id", "applies")],
                         by = "unique_id")

testlaws_tfl <- left_join(relevant_tfl_100p,
                          testlaws_tfl[, c("unique_id", "applies")],
                          by = "unique_id")

# 3B) Create subtopic column ---------------------

# import relevancy screened data with subtopic
relevant_mc_100p_st <- read_csv("Code/DerivedData/relevant_mc_100p_st.csv")
relevant_tfl_100p_st <- read_csv("Code/DerivedData/relevant_tfl_100p_st.csv")

# Join subtopics to relevancy. The key is stated explicitly instead of
# relying on dplyr's implicit "all shared column names" key. An entry listed
# under several subtopics is repeated once per subtopic.
testlaws_mc <- left_join(testlaws_mc,
                         relevant_mc_100p_st[, c("unique_id", "st")],
                         by = "unique_id")
testlaws_tfl <- left_join(testlaws_tfl,
                          relevant_tfl_100p_st[, c("unique_id", "st")],
                          by = "unique_id")

# we can see that the index column is different for mc (...1) and tfl (...2)
names(testlaws_mc)
names(testlaws_tfl)

# change index column to be the same, so rbind() sees matching column names
names(testlaws_tfl)[names(testlaws_tfl) == "...2"] <- "...1"

# join the testlaw mc and tfl variables  
testlaws_mctfl <- rbind(testlaws_mc, testlaws_tfl)


# 4) Format Data for Plotting  -----------------------------------------------------

# 4A) Rename Subtopics ------------------------------------

# Lookup from full subtopic name to the short label used in the plots;
# subtopics not listed here (e.g. Trade, Tourism) keep their name.
st_abbrev <- c(
  "Ag. Development" = "Ag. Dev.",
  "Species Migration" = "Sp Migr.",
  "Conservation" = "Cons.",
  "Governance" = "Gov.",
  "Land Change" = "LC"
)

# swap in the short label wherever the subtopic is in the lookup
has_abbrev <- testlaws_mctfl$st %in% names(st_abbrev)
testlaws_mctfl$st[has_abbrev] <- st_abbrev[testlaws_mctfl$st[has_abbrev]]

# entries with no subtopic (NA) are labelled Miscellaneous
testlaws_mctfl$st[is.na(testlaws_mctfl$st)] <- "Misc."

# check which subtopic labels are now present
unique(testlaws_mctfl$st)

#write_csv(testlaws_mctfl, "old_scr_testlaws_st.csv")

# 4B)  Re-format Entry Data ----------------------

### change data to keep only Y, N, and U ###

### NOTE 1: str_replace() only swaps the FIRST match in each string, so a
### two-symbol entry such as "**" or "!!" kept its second symbol (which then
### showed up as NA in the final graph). str_replace_all() converts every
### symbol in one pass, so nothing has to be run twice.

# change all stars (*) to Y's and all !'s (prev. "^"s) to N's
testlaws_mctfl$applies <- testlaws_mctfl$applies %>%
  str_replace_all(fixed("*"), "Y") %>%
  str_replace_all(fixed("!"), "N")

# test, should only be combinations of the letters Y, N, U, T, R, X
unique(testlaws_mctfl$applies)


## split each entry into its two single-letter answers using str_sub ##

# str_sub takes the string plus a start and an end position
testlaws_mctfl$applies_mc <- str_sub(string = testlaws_mctfl$applies, start = 1, end = 1) # 1st char (MC)
testlaws_mctfl$applies_tfl <- str_sub(string = testlaws_mctfl$applies, start = 2, end = 2) # 2nd char (TFL)

# NOTE 2: drop every row where either answer is R or X.
# Rows whose answer is NA are dropped too; the original step-by-step
# subset(x != "R") calls did this implicitly because subset() discards rows
# whose condition evaluates to NA. It is spelled out here because %in% alone
# would keep them.
testlaws_mctfl <- subset(
  testlaws_mctfl,
  !is.na(applies_mc) & !is.na(applies_tfl) &
    !(applies_mc %in% c("R", "X")) &
    !(applies_tfl %in% c("R", "X"))
)

# test, should only be U (Unsure), Y (Yes), N (No), T (Partly)
unique(testlaws_mctfl$applies_mc)
unique(testlaws_mctfl$applies_tfl)


# 4C) Summarize into Count and Percent ---------------------------

# Count and percent of each answer for one law within each group.
#
# data       : data frame with the grouping columns and an `applies_<law>`
#              column of single-letter answers (T / N / U / Y)
# group_cols : character vector of columns to group by (e.g. c("topic", "st"))
# law        : "mc" or "tfl"; selects the `applies_<law>` column and names the
#              `count_<law>` and `perc_<law>` output columns
#
# Returns one row per group x answer, still grouped by `group_cols`, with
# `applies_<law>` as a factor ordered Unsure, No, Partly, Yes for plotting.
# Any answer outside T / N / U / Y becomes NA.
summarise_applies <- function(data, group_cols, law) {
  applies_col <- paste0("applies_", law)
  count_col <- paste0("count_", law)
  perc_col <- paste0("perc_", law)

  # abbreviation -> full word, listed in plotting order
  answer_labels <- c(U = "Unsure", N = "No", T = "Partly", Y = "Yes")

  out <- data %>%
    # first, create the count column
    group_by(across(all_of(c(group_cols, applies_col)))) %>%
    # drop_last leaves the result grouped by `group_cols` only, so the
    # percentages below add up to 100 within each group
    summarise("{count_col}" := n(), .groups = "drop_last") %>%
    # then, create the percent column
    mutate(
      "{perc_col}" := round(.data[[count_col]] / sum(.data[[count_col]]) * 100, 1)
    )

  # then, change the abbreviations to full words and order them for plotting
  out[[applies_col]] <- factor(
    unname(answer_labels[as.character(out[[applies_col]])]),
    levels = unname(answer_labels)
  )

  out
}

###   MC   ###
testlaws_count_mc <- summarise_applies(testlaws_mctfl, c("topic", "st"), "mc")

###   TFL   ###
testlaws_count_tfl <- summarise_applies(testlaws_mctfl, c("topic", "st"), "tfl")

### Both Topics ###

# Both Topics -- MC #
testlaws_count_mctfl_mc <- summarise_applies(testlaws_mctfl, "st", "mc")

# Both Topics -- TFL #
testlaws_count_mctfl_tfl <- summarise_applies(testlaws_mctfl, "st", "tfl")

# 5) Plot Data per Topic --------------------------------------------------------

# 5A) Specify Formatting Variables --------------

# Create variable for plotting colors - could switch to a palette later

# used https://colorbrewer2.org/#type=diverging&scheme=PuOr&n=7 for colorblind-friendly
# palette and used plotrix package and color.id fxn to go from hex code to R color name

# Previous (non-colorblind-friendly) palette, same order:
# plot_colors = c("antiquewhite3","tomato","goldenrod1","chartreuse3")

# Named by answer so scale_fill_manual() matches colors by name. An unnamed
# vector is matched by position, so the colors would shift onto the wrong
# answers whenever one answer (e.g. "Unsure") is absent from the plotted data.
plot_colors <- c(
  "Unsure" = "gray97",
  "No" = "pink2",
  "Partly" = "lemonchiffon",
  "Yes" = "darkolivegreen3"
) # colorblind friendly



# 5B) Plot Metacoupling -------------------------

# Does MC Apply?? -- stacked percent bars per subtopic, one panel per topic

plot_mcapply_perc <- ggplot(
  data = testlaws_count_mc,
  mapping = aes(x = st, y = perc_mc, fill = applies_mc)
) +
  geom_col() + # bar height is taken straight from perc_mc
  facet_wrap(~ topic) + # one panel per topic
  scale_fill_manual(values = plot_colors) + # custom color scheme
  labs(
    title = "Does Metacoupling Apply Across Sustainability Literature?",
    subtitle = "Percent grouped by MC and TFL",
    x = "Sustainability Subtopic",
    y = "Percent of Entries"
  ) +
  theme_bw() + # cleaner than the gray default background
  theme(legend.title = element_blank()) # no legend title

# show the plot
plot_mcapply_perc



# 5C) Plot Tobler's First Law -------------------

# Does TFL Apply?? -- stacked percent bars per subtopic, one panel per topic
plot_tflapply_perc <- ggplot(
  data = testlaws_count_tfl,
  mapping = aes(x = st, y = perc_tfl, fill = applies_tfl)
) +
  geom_col() + # bar height is taken straight from perc_tfl
  facet_wrap(~ topic) + # one panel per topic
  scale_fill_manual(values = plot_colors) + # custom color scheme
  labs(
    title = "Does TFL Apply Across Sustainability Literature?",
    subtitle = "Percent grouped by MC and TFL",
    x = "Sustainability Subtopic",
    y = "Percent of Entries"
  ) +
  theme_bw() +
  theme(legend.title = element_blank()) # no legend title

# show the plot
plot_tflapply_perc

# 5D) Plot if MC and TFL Apply Across Mixed Topics -------------

# The two plot names used to be swapped: the MC answers were stored in
# plot_tflapply_mixedperc and the TFL answers in plot_mcapply_mixedperc, so
# the figures exported under the "mc" and "tfl" file names contained each
# other's data. Each name now holds the plot it describes.

# MC answers (applies_mc), MC and TFL articles pooled -- no facet by topic
plot_mcapply_mixedperc <- ggplot(testlaws_count_mctfl_mc, aes(x = st, y = perc_mc, fill = applies_mc)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Sustainability Subtopic",
    y = "Percent of Entries",
    title = "Which Approach is More Appropriate for this Entry?",
    subtitle = "Percent with MC and TFL articles mixed"
  ) +
  theme_bw() +
  scale_fill_manual(values = plot_colors) +
  theme(
    legend.title = element_blank()
  )
plot_mcapply_mixedperc


# TFL answers (applies_tfl), MC and TFL articles pooled -- no facet by topic
plot_tflapply_mixedperc <- ggplot(testlaws_count_mctfl_tfl, aes(x = st, y = perc_tfl, fill = applies_tfl)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Sustainability Subtopic",
    y = "Percent of Entries",
    title = "Does This Entry Obey TFL?",
    subtitle = "Percent with MC and TFL articles mixed"
  ) +
  theme_bw() +
  scale_fill_manual(values = plot_colors) +
  theme(
    legend.title = element_blank()
  )
plot_tflapply_mixedperc




# 5E) Export & Save CSVs and Plots ----------------------

# Per-topic (facet-wrapped) percentage plots
ggsave(filename = "data/figures/testlaw_mc_perc.png", plot = plot_mcapply_perc)
ggsave(filename = "data/figures/testlaw_tfl_perc.png", plot = plot_tflapply_perc)

# Mixed-literature percentage plots (MC and TFL articles pooled)
ggsave(filename = "data/figures/testlaw_mc_mixedperc.png", plot = plot_mcapply_mixedperc)
ggsave(filename = "data/figures/testlaw_tfl_mixedperc.png", plot = plot_tflapply_mixedperc)


# NOTE: If there is an NA on the TFL plot, refer to 4B Note 1, should be taken care of


############################     END     ############################






# Extra Code: Plots & Whatnots -------------------------------------------------

# Does MC Apply?? -- Percent grouped by subtopic -- 2nd most helpful
# plot_mcapply_perc_st <- ggplot(testlaws_count_mc, aes(x = topic, y = perc_mc, fill = applies_mc)) + 
#   geom_bar(stat = "identity") + 
#   facet_wrap(~ st)+
#   theme_bw()+
#   labs(
#     x = "Sustainability Subtopic",
#     y = "Percent of Entries",
#     title = "Does Metacoupling Apply Across Sustainability Literature?",
#     subtitle = "Percent grouped by subtopic"
#   )+
#   scale_fill_manual(values = plot_colors)+
#   theme(
#     legend.title = element_blank()
#   )


# Does MC Apply?? -- Number grouped by MC and TFL
# ggplot(testlaws_count_mc, aes(x = st, y = count_mc, fill = applies_mc)) + 
#   geom_bar(stat = "identity") + 
#   facet_wrap(~ topic)+
#   labs(
#     x = "Sustainability Subtopic",
#     y = "Number of Entries",
#     title = "Does Metacoupling Apply Across Sustainability Literature?",
#     subtitle = "Number grouped by MC and TFL"
#   )+
#   theme_bw()+
#   scale_fill_manual(values = plot_colors)+
#   theme(
#     legend.title = element_blank()
#   )


# # Does MC Apply?? -- Number grouped by subtopic
# ggplot(testlaws_count_mc, aes(x = topic, y = count_mc, fill = applies_mc)) + 
#   geom_bar(stat = "identity") + 
#   facet_wrap(~ st)+
#   labs(
#     x = "Sustainability Subtopic",
#     y = "Number of Entries",
#     title = "Does Metacoupling Apply Across Sustainability Literature?",
#     subtitle = "Number grouped by subtopic"
#   )+
#   theme_bw()+
#   scale_fill_manual(values = plot_colors)+
#   theme(
#     legend.title = element_blank()
#   )
#   




# title: testTFL_compareYLandNM.R
# Author: Nick Manning & Yingjie Li
# Date: 7/28/2022
# Purpose: Read in YL's 20p testTFL result and compare it
# to NM's 100p result 

# Requires:
## fully screened CSV's from NM (from final_screened_results.R & testTFL_screen_testing_laws.R)
## 20p screened CSV's from YL

# Results in: 
## "Does TFL Apply" and "What framework is more appropriate?" MC / TFL CSVs (in screen_apply)
## Comparison values for YL & NM

#####################################################
rm(list =ls())
library(dplyr)
library(stringr)

###### COMPARE MC's ######

# load in responses from NM (100p) and YL (20p)
mc_100p_nm <- read.csv("scr_testTFL_mc100p_nm.csv")
mc_20p_yl <- read.csv("scr_testTFL_mc20p_YL.csv")

# YL's answer is the first character of the notes column
# (str_sub takes the string plus a start and an end position)
mc_20p_yl$TFL_apply <- str_sub(string = mc_20p_yl$notes, start = 1, end = 1)
#mc_20p_yl$appropriate <- str_sub(string = mc_20p_yl$notes, start = 2) # gets the last char (TFL)

# filter NM's 100p set down to the entries YL screened, based on 'unique_id'
mc_20p_nm <- semi_join(mc_100p_nm, mc_20p_yl, by = "unique_id")


# keep only relevant columns #notes_og omitted
mc_20p_nm <- select(mc_20p_nm, c("unique_id", "title", "abstract", "TFL_apply", "year"))

mc_20p_yl <- select(mc_20p_yl, c("unique_id", "title", "abstract", "TFL_apply", "year"))

# test accuracy for MC
# Pair the two reviewers' rows on every column except the answer itself; the
# answers come out as TFL_apply.x (NM) and TFL_apply.y (YL).
mc_20p <- merge(
  x = mc_20p_nm,
  y = mc_20p_yl,
  by = setdiff(names(mc_20p_nm), "TFL_apply")
) %>%
  dplyr::mutate(same = ifelse(TFL_apply.x == TFL_apply.y, 1, 0))

# Share of NM's entries on which both reviewers gave the same answer. The
# denominator is NM's row count, so entries that failed to pair up in the
# merge count as disagreement.
mc_20p_alignment <- mc_20p %>%
  dplyr::summarise(alignment = sum(same, na.rm = TRUE) / nrow(mc_20p_nm))

mc_20p_alignment$alignment %>%
  scales::percent(accuracy = 0.01) %>%
  cat('Alignment between NM and YL:', ., 'for MC') ## XX.XX%

### NO UNSURES ###
# merge again, this time removing any unsure values before scoring
mc_20p_nou <- merge(
  x = mc_20p_nm,
  y = mc_20p_yl,
  by = setdiff(names(mc_20p_nm), "TFL_apply")
)

# set all R's to X's so the two codes count as the same answer
mc_20p_nou$TFL_apply.x[mc_20p_nou$TFL_apply.x == 'R'] <- 'X'
mc_20p_nou$TFL_apply.y[mc_20p_nou$TFL_apply.y == 'R'] <- 'X'

# remove all rows where either reviewer answered U
# (subset() also drops rows where either answer is NA)
mc_20p_nou <- subset(mc_20p_nou, TFL_apply.x != "U" & TFL_apply.y != "U") %>%
  dplyr::mutate(same = ifelse(TFL_apply.x == TFL_apply.y, 1, 0))



# calculate alignment over the remaining rows only
mc_20p_nou_alignment <- mc_20p_nou %>%
  dplyr::summarise(alignment = sum(same, na.rm = TRUE) / nrow(mc_20p_nou))

mc_20p_nou_alignment$alignment %>%
  scales::percent(accuracy = 0.01) %>%
  cat(' Revised Alignment between NM and YL:', ., 'for MC') ## YY.YY


###################################

###### COMPARE TFL'S ######

# load in responses from NM (100p) and YL (20p)
tfl_100p_nm <- read.csv("scr_testTFL_tfl100p_nm.csv")
tfl_20p_yl <- read.csv("scr_testTFL_tfl20p_YL.csv")

# YL's answer is the first character of the notes column
# (str_sub takes the string plus a start and an end position)
tfl_20p_yl$TFL_apply <- str_sub(string = tfl_20p_yl$notes, start = 1, end = 1)
#tfl_20p_yl$appropriate <- str_sub(string = tfl_20p_yl$notes, start = 2) # gets the last chars (Appr)


# filter NM's 100p set down to the entries YL screened, based on 'unique_id'
tfl_20p_nm <- semi_join(tfl_100p_nm, tfl_20p_yl, by = "unique_id")


# keep only relevant columns #notes_og omitted
tfl_20p_nm <- select(tfl_20p_nm, c("unique_id", "title", "abstract", "TFL_apply", "year"))
tfl_20p_yl <- select(tfl_20p_yl, c("unique_id", "title", "abstract", "TFL_apply", "year"))


### test accuracy for tfl ###

# Pair the two reviewers' rows on every column except the answer itself; the
# answers come out as TFL_apply.x (NM) and TFL_apply.y (YL).
tfl_20p <- merge(
  x = tfl_20p_nm,
  y = tfl_20p_yl,
  by = setdiff(names(tfl_20p_nm), "TFL_apply")
) %>%
  dplyr::mutate(same = ifelse(TFL_apply.x == TFL_apply.y, 1, 0))

# Share of NM's entries on which both reviewers gave the same answer. The
# denominator is NM's row count, so entries that failed to pair up in the
# merge count as disagreement.
tfl_20p_alignment <- tfl_20p %>%
  dplyr::summarise(alignment = sum(same, na.rm = TRUE) / nrow(tfl_20p_nm))

tfl_20p_alignment$alignment %>%
  scales::percent(accuracy = 0.01) %>%
  cat('Alignment between NM and YL:', ., 'for TFL') ## YY.YY%

### NO UNSURES ###
# merge again, this time removing any unsure values before scoring
tfl_20p_nou <- merge(
  x = tfl_20p_nm,
  y = tfl_20p_yl,
  by = setdiff(names(tfl_20p_nm), "TFL_apply")
)

# NOTE(review): the MC comparison recodes R to X at this point; no such
# recode is done for TFL. Confirm whether that difference is intentional.

# remove all rows where either reviewer answered U
# (subset() also drops rows where either answer is NA)
tfl_20p_nou <- subset(tfl_20p_nou, TFL_apply.x != "U" & TFL_apply.y != "U") %>%
  dplyr::mutate(same = ifelse(TFL_apply.x == TFL_apply.y, 1, 0))

# calculate alignment over the remaining rows only
tfl_20p_nou_alignment <- tfl_20p_nou %>%
  dplyr::summarise(alignment = sum(same, na.rm = TRUE) / nrow(tfl_20p_nou))

tfl_20p_nou_alignment$alignment %>%
  scales::percent(accuracy = 0.01) %>%
  cat('Revised Alignment between NM and YL:', ., 'for TFL') ## YY.YY

