Biometrics Dataset: contains nine skeletal measurements and twelve girth measurements along with age, weight, height, and gender.
Cancer Dataset: contains data on tumor types and hypermethylation genes
Compliance Dataset: dataset of treatment compliance for a 10-week study on physical activity and cognitive functioning.
OCD Dataset: dataset of a small observational study on OCD. Patients were measured using the YBOCS scale over eight months.
NY COVID-19 Dataset: US state-level COVID-19 dataset from the New York Times data portal (the analysis below uses the New York State rows).
# Biometrics dataset (measurement data are on the second worksheet)
biometrics_raw <- read_excel('Datasets/biometrics.xls', sheet = 2)
# Cancer dataset
# skip = 2 drops preamble rows above the column header;
# rows with a missing hypermethylation value are removed up front
cancer <- read_csv('Datasets/cancer_genes.csv', skip = 2) %>%
drop_na(hyper_methylation)
# Compliance dataset: recode missing compliance as 0 (non-compliant)
# NOTE(review): assumes NA means "did not comply" rather than "not measured" — confirm
compliance <- read_csv('Datasets/activity_compliance.csv') %>%
mutate(compliant = if_else(is.na(compliant), 0, compliant))
# OCD dataset: wide longitudinal YBOCS scores plus a separate demographics table,
# joined later on study_id
ocd_longitudinal <- read_csv('Datasets/ocd_longitudinal.csv')
ocd_demos <- read_csv('Datasets/ocd_demos.csv')
# COVID-19 dataset: cumulative state-level counts pulled from the NYT GitHub mirror
covid_raw <- read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")
# biometrics dataset manipulation: demographics + girths with derived BMI variables
biometrics_girth <- biometrics_raw %>%
  # keep identifiers/demographics plus every girth measurement
  select(id, height, weight, gender, contains("girth")) %>%
  mutate(
    # BMI from weight (kg) and height (cm), rounded to 1 decimal place
    bmi = round(weight / (height / 100)^2, 1),
    # combined navel + thigh girth
    navel_thigh_girth = navel_girth + thigh_girth,
    # BMI category; conditions are evaluated in order, so each later test
    # only sees the remaining range (a missing bmi stays NA)
    bmi_cat = case_when(bmi < 18.5 ~ "Underweight",
                        bmi < 25   ~ "Normal",
                        bmi < 30   ~ "Overweight",
                        bmi >= 30  ~ "Obese"),
    # ordered factor version of the category for plots and tables
    bmi_fac = factor(bmi_cat,
                     levels = c("Underweight", "Normal", "Overweight", "Obese"))
  )
# compliance dataset summarization of proportion by week:
# count individuals per (week, compliant) combination
proportion_by_week <- compliance %>%
group_by(week, compliant) %>%
summarize(n = n()) %>%
# after summarize() drops the last grouping level, the result is still grouped
# by week, so sum(n) is the weekly total and compliance_proportion is the
# within-week proportion for each compliant value
mutate(compliance_proportion = n / sum(n),
# NOTE(review): sd() here is taken across the week's two proportions
# (compliant and non-compliant), not the usual binomial SE
# sqrt(p*(1-p)/n) for a single proportion — confirm this CI is intended
sd = sd(compliance_proportion),
se = sd/(sqrt(n)),
lower_ci = compliance_proportion - (1.96*se),
upper_ci = compliance_proportion + (1.96*se)) %>%
# keep only the compliant rows: one proportion (with CI) per week
filter(compliant == 1) %>%
select(week, compliance_proportion, lower_ci, upper_ci)
# print results
kable(proportion_by_week)
week | compliance_proportion | lower_ci | upper_ci |
---|---|---|---|
1 | 0.8000000 | 0.6799750 | 0.9200250 |
2 | 0.7500000 | 0.6466989 | 0.8533011 |
3 | 0.6500000 | 0.5834221 | 0.7165779 |
4 | 0.7500000 | 0.6466989 | 0.8533011 |
5 | 0.5666667 | 0.5349753 | 0.5983580 |
6 | 0.5166667 | 0.5083693 | 0.5249640 |
7 | 0.4666667 | 0.4492056 | 0.4841277 |
8 | 0.6666667 | 0.5936218 | 0.7397116 |
9 | 0.6166667 | 0.5635027 | 0.6698306 |
10 | 0.5166667 | 0.5083693 | 0.5249640 |
# join longitudinal YBOCS scores to demographics on the shared patient id
ocd <- inner_join(ocd_longitudinal, ocd_demos, by = "study_id")
# reshape wide monthly columns (ybocs_m1 ... ybocs_m8) into one row per patient-month
ocd_long <- ocd %>%
  pivot_longer(cols = starts_with("ybocs"),
               names_to = "month",
               # strip the "ybocs_m" prefix, then read the month as a number
               names_prefix = "ybocs_m",
               names_transform = list(month = as.numeric),
               values_to = "ybocs_score") %>%
  # remission indicator: YBOCS score of 15 or below
  mutate(remission = if_else(ybocs_score <= 15, "1", "0")) %>%
  # order rows by id, then lock that ordering into factor levels for plotting
  arrange(study_id) %>%
  mutate(study_id = factor(study_id, levels = unique(study_id)),
         rem_fac = factor(remission, levels = c("0", "1"))) %>%
  # minor/adult split at age 18
  mutate(age_cat = if_else(age < 18, "minor", "adult"))
# example t-test between ankle diameter measure and gender
# unequal variance: t.test() defaults to var.equal = FALSE, i.e. Welch's t-test
ankle_res <- t.test(ankle_diameter ~ gender, data = biometrics_raw)
# pull results to objects (used for in-line reporting in the text below)
t_stat <- ankle_res$statistic # t statistic
df <- ankle_res$parameter # degrees of freedom (NB: `df` shadows the common data-frame abbreviation)
p_val <- ankle_res$p.value # p-value
ci_lower <- ankle_res$conf.int[1] # 95% confidence interval
ci_upper <- ankle_res$conf.int[2]
# create tibble of results and print (tidy() gives a one-row summary)
ankle_res_tidy <- tidy(ankle_res)
kable(ankle_res_tidy)
estimate | estimate1 | estimate2 | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
---|---|---|---|---|---|---|---|---|---|
-1.717591 | 13.02654 | 14.74413 | -21.3131 | 0 | 495.6447 | -1.875928 | -1.559254 | Welch Two Sample t-test | two.sided |
# interpret using in-line code
Interpretation: At the 5% level of significance, we reject the null hypothesis that there is no mean difference in ankle diameter between gender groups (t=-21.31, df=495.64, p<0.0001). We are 95% confident that the true mean difference in ankle diameter between females and males lies between -1.88 and -1.56.
# test statistic for the permutation test: difference in mean ankle diameter,
# Female minus Male
calculate_ts <- function(df){
  gender_means <- df %>%
    group_by(gender) %>%
    summarize(mean_ank = mean(ankle_diameter))
  # subtract the two group means directly from the summary table
  gender_means$mean_ank[gender_means$gender == "Female"] -
    gender_means$mean_ank[gender_means$gender == "Male"]
}
# calculate observed test statistic with function
# (Female minus Male difference in mean ankle diameter on the real data)
obs_stat <- calculate_ts(biometrics_raw)
# one permutation: shuffle the gender labels, leaving every other column intact
perform_permutation <- function(df){
  df %>%
    mutate(gender = sample(gender)) # sample() without size is a full shuffle
}
# one simulated test statistic: permute the labels, then recompute the statistic
get_permutation_ts <- function(df){
  calculate_ts(perform_permutation(df))
}
# simulate 2000 test statistics under the null (gender labels exchangeable)
results <- map_dbl(1:2000, ~ get_permutation_ts(biometrics_raw))
# collect the simulated statistics into a tibble for summarising and plotting
perm_results <- tibble(sim_stat = results)
# permutation p-value: proportion of simulated test statistics whose absolute
# value is greater than or equal to the observed test statistic
perm_pval <- perm_results %>%
  # mean of a logical vector is the proportion of TRUEs. Unlike the previous
  # group_by/filter/pull approach, this still returns 0 (not numeric(0)) when
  # no simulated statistic is as extreme as the observed one.
  summarize(p_value = mean(abs(sim_stat) >= abs(obs_stat))) %>%
  pull(p_value)
# histogram of the permutation distribution with the observed statistic marked
ggplot(perm_results, aes(x = sim_stat)) +
  geom_histogram(fill = "darkgrey", color = "white") +
  # observed difference in means sits far in the left tail
  geom_vline(xintercept = obs_stat, color = "red", linetype = "dashed") +
  theme_bw() +
  labs(x = "Simulated Test Statistics", y = "Count") +
  annotate(geom = "text", x = -0.8, y = 410, label = "Observed Test Statistic = -1.72", color = "red")
Interpretation: At the 5% level of significance, we reject the null hypothesis that there is no mean difference in ankle diameter between gender groups (p<0.0001).
# binary outcomes for logistic regression: overweight (BMI >= 25), obese (BMI >= 30)
biometrics_log <- biometrics_raw %>%
  mutate(bmi = round(weight / (height / 100)^2, 1),
         overweight = if_else(bmi >= 25, 1, 0),
         obese = if_else(bmi >= 30, 1, 0))
### COVARIATE EXPLORATION ###
# Every exploratory model has the same shape: binary outcome ~ girth + gender.
# A small helper removes the six-fold copy/paste while producing the same
# fitted objects (identical formula terms, family, and data).
fit_girth_logistic <- function(outcome, predictor, data) {
  # reformulate() builds e.g. overweight ~ thigh_girth + gender
  glm(reformulate(c(predictor, "gender"), response = outcome),
      family = "binomial", data = data)
}
# overweight outcome, thigh girth predictor, adjusted for gender
thigh_overweight <- fit_girth_logistic("overweight", "thigh_girth", biometrics_log)
# obese outcome, thigh girth predictor, adjusted for gender
thigh_obese <- fit_girth_logistic("obese", "thigh_girth", biometrics_log)
# overweight outcome, wrist girth predictor, adjusted for gender
wrist_overweight <- fit_girth_logistic("overweight", "wrist_girth", biometrics_log)
# obese outcome, wrist girth predictor, adjusted for gender
wrist_obese <- fit_girth_logistic("obese", "wrist_girth", biometrics_log)
# overweight outcome, waist girth predictor, adjusted for gender
waist_overweight <- fit_girth_logistic("overweight", "waist_girth", biometrics_log)
# obese outcome, waist girth predictor, adjusted for gender
waist_obese <- fit_girth_logistic("obese", "waist_girth", biometrics_log)
# regression table on the odds-ratio scale for the waist/obese model
tbl_regression(waist_obese, exponentiate = TRUE)
Characteristic | OR1 | 95% CI1 | p-value |
---|---|---|---|
waist_girth | 1.30 | 1.21, 1.43 | <0.001 |
gender | |||
Female | — | — | |
Male | 0.02 | 0.00, 0.11 | <0.001 |
1 OR = Odds Ratio, CI = Confidence Interval |
# exploratory results visualized in FOREST PLOT
Example Interpretation: At the 5% level of significance, we reject the null hypothesis that waist girth is not associated with the odds of being obese (BMI>=30), adjusting for gender (p<0.001). For every 1cm increase in waist girth, the odds of being obese increases by a factor of 1.30 on average, adjusting for gender.
### MULTIVARIATE LOGISTIC MODEL ###
# all three candidate girth predictors plus gender in one model
mult_obese <- glm(obese ~ waist_girth + wrist_girth + thigh_girth + gender,
family = "binomial", data = biometrics_log)
# present estimates as odds ratios with 95% CIs
tbl_regression(mult_obese, exponentiate = TRUE)
Characteristic | OR1 | 95% CI1 | p-value |
---|---|---|---|
waist_girth | 1.31 | 1.19, 1.49 | <0.001 |
wrist_girth | 0.94 | 0.43, 2.04 | 0.9 |
thigh_girth | 1.32 | 1.15, 1.56 | <0.001 |
gender | |||
Female | — | — | |
Male | 0.04 | 0.00, 0.37 | 0.006 |
1 OR = Odds Ratio, CI = Confidence Interval |
# distribution of the combined navel + thigh girth variable
ggplot(biometrics_girth, aes(x = navel_thigh_girth)) +
  geom_histogram(color = "white", fill = 'darkgrey') +
  theme_bw() +
  labs(title = "Distribution of Combined Navel and Thigh Girth",
       x = "Navel and Thigh Girth", y = "Count")
# combined navel + thigh girth, split by gender
ggplot(biometrics_girth, aes(x = gender, y = navel_thigh_girth)) +
  geom_boxplot(fill = 'darkgrey') +
  theme_bw() +
  labs(title = "Distribution of Combined Navel and Thigh Girth",
       y = "Navel and Thigh Girth", x = "Gender")
# exploratory scatterplot: continuous BMI vs combined navel + thigh girth
# (shared aesthetics are set once in ggplot() and inherited by both geoms)
ggplot(biometrics_girth, aes(x = bmi, y = navel_thigh_girth)) +
  # raw observations
  geom_point() +
  # linear best-fit line, no confidence band
  geom_smooth(method = 'lm', se = FALSE, color = 'red') +
  labs(title = "Relationship between BMI and Combined Navel and Thigh Girth",
       y = "Navel and Thigh Girth", x = "BMI") +
  theme_bw()
# same scatterplot faceted by BMI category (rows) and gender (columns);
# each facet gets its own within-category best-fit line
ggplot(biometrics_girth, aes(x = bmi, y = navel_thigh_girth)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE, color = 'red') +
  facet_grid(bmi_fac ~ gender) +
  theme_bw() +
  labs(title = "Relationship between BMI and Combined Navel and Thigh Girth,",
       subtitle = "by BMI category and gender",
       y = "Combined Navel and Thigh Girth", x = "BMI")
# heatmap of tumor types (rows) and genes (columns); fill encodes the
# severity of hypermethylation
ggplot(cancer, aes(x = gene, y = tumor_type, fill = hyper_methylation)) +
  geom_tile() +
  labs(y = "Tumor Type", x = "Gene", fill = "Hyper Methylation",
       title = "Hyper Methylation Values by Tumor Type and Gene") +
  # rotate the gene labels so they stay readable
  theme(axis.text.x = element_text(angle = 90)) +
  scale_fill_gradient(high = "darkred", low = "white")
# spaghetti plot: one YBOCS trajectory line per patient across the 8 months
ggplot(ocd_long, aes(x = month, y = ybocs_score, group = study_id)) +
  geom_line(alpha = 0.5) + # transparency to reduce overplotting
  theme_bw() +
  labs(x = "Month", y = "YBOCS Score", title = "Patient YBOCS Scores by Month") +
  scale_x_continuous(breaks = 1:8, labels = scales::label_number(prefix = "Month "))
# lasagna plot: remission status per patient-month, split minors vs adults
ggplot(ocd_long, aes(x = month, y = study_id, fill = rem_fac)) +
  geom_tile(color = "black") +
  labs(x = "Month", y = "Patient ID", title = "Patient Remission Status by Month",
       fill = "Remission Status") +
  theme(legend.position = "bottom",
        panel.background = element_rect(fill = "white")) +
  scale_x_continuous(breaks = 1:8) +
  scale_fill_manual(values = c("grey", "darkblue")) +
  # free scales let each age facet show only its own patient IDs
  facet_wrap(~age_cat, scales = "free")
# visualize exploratory variable Odds Ratios from logistic regressions with biometrics dataset
# helper: tidy a fitted model onto the OR scale with 95% CIs and tag the outcome
# (replaces six identical tidy() %>% mutate() pipelines)
tidy_or <- function(model, outcome_label) {
  tidy(model, exponentiate = TRUE, conf.int = TRUE) %>%
    mutate(outcome = outcome_label)
}
OR_thigh_overweight <- tidy_or(thigh_overweight, "overweight")
OR_thigh_obese <- tidy_or(thigh_obese, "obese")
OR_wrist_overweight <- tidy_or(wrist_overweight, "overweight")
OR_wrist_obese <- tidy_or(wrist_obese, "obese")
OR_waist_overweight <- tidy_or(waist_overweight, "overweight")
OR_waist_obese <- tidy_or(waist_obese, "obese")
# stack the six OR dataframes into one table for the forest plot
all_ORs <- bind_rows(OR_thigh_overweight, OR_thigh_obese,
                     OR_wrist_overweight, OR_wrist_obese,
                     OR_waist_overweight, OR_waist_obese) %>%
  # keep only the girth terms (drops intercept and gender rows)
  filter(str_detect(term, "girth")) %>%
  # short axis labels plus readable facet labels for the plot
  mutate(term_lab = str_remove(term, "_girth"),
         outcome_lab = if_else(outcome == "obese",
                               "Obese (BMI>=30)", "Overweight (BMI>=25)"))
# forest plot summary of the exploratory odds ratios
ggplot(all_ORs, aes(y = term_lab)) +
  geom_point(aes(x = estimate)) +
  # width controls the size of the error-bar caps
  geom_errorbar(aes(xmin = conf.low, xmax = conf.high), width = 0.2) +
  facet_wrap(~outcome_lab, ncol = 1) +
  labs(x = "Odds Ratio", y = "Girth (cm)") +
  # OR = 1 reference line (no association)
  geom_vline(xintercept = 1.0, linetype = "dashed") +
  scale_x_continuous(breaks = seq(from = 0.0, to = 5.0, by = 0.5), limits = c(0, 5)) +
  theme_bw()
# shows the proportion of compliant individuals and 95% confidence interval per week over target compliance rate from compliance dataset
ggplot(data = proportion_by_week) +
# weekly point estimate with CI whiskers
geom_pointrange(aes(x = week, y = compliance_proportion, ymax = upper_ci, ymin = lower_ci)) +
# dotted line connecting the weekly estimates
geom_line(aes(x = week, y = compliance_proportion), linetype = "dotted") +
# horizontal reference line at the 0.65 target compliance rate
geom_hline(aes(yintercept = 0.65), color = "darkblue", linetype = "dashed") +
theme_bw() +
labs(title = "Observed Proportion of Compliant Individuals by Week",
x = "Week", y = "Proportion") +
scale_x_continuous(breaks = 1:10, labels = paste("Week", 1:10)) +
# limits = 0:1 is c(0, 1): pin the y axis to the full proportion range
scale_y_continuous(limits = 0:1) +
# annotation placed to sit above the reference line, clear of the points
annotate(geom = "text", x = 8, y = 0.9, label = "Target Compliance = 0.65",
color = "darkblue")
# patchwork plot showing patient trajectories over time, faceted by minor status from OCD dataset
# (lasagna and spaghetti rebuild the two plots drawn earlier, this time saved
# to objects so patchwork can compose them into one figure)
lasagna <- ggplot(data = ocd_long) +
geom_tile(aes(x = month, y = study_id, fill = rem_fac), color = "black") +
labs(x = "Month", y = "Patient ID", title = "Patient Remission Status by Month",
fill = "Remission Status") +
theme(legend.position = "bottom",
panel.background = element_rect(fill="white")) +
scale_x_continuous(breaks = 1:8) +
scale_fill_manual(values = c("grey", "darkblue")) +
# free scales let each age facet show only its own patient IDs
facet_wrap(~age_cat, scales = "free")
spaghetti <- ggplot(data = ocd_long) +
geom_line(aes(x = month, y = ybocs_score, group = study_id), alpha=0.5) +
theme_bw() +
labs(x = "Month", y = "YBOCS Score", title = "Patient YBOCS Scores by Month") +
scale_x_continuous(breaks = 1:8, labels = scales::label_number(prefix = "Month ")) +
facet_wrap(~age_cat)
# patchwork: `/` stacks lasagna above spaghetti; plot_annotation adds an overall title
lasagna / spaghetti +
plot_annotation(title = "Patient Trajectories",
theme = theme(plot.title = element_text(size=rel(1.5), face="bold")))
# labeled table presenting mean, standard deviation, minimum, median, maximum, p-value of t-test of continuous variables, split by gender
biometrics_raw %>%
select(gender, contains("girth")) %>%
# gtsummary table with one column per gender level
tbl_summary(by = gender,
# summary cell format: mean (sd), min median max
statistic = all_continuous() ~ "{mean} ({sd}), {min} {median} {max}",
# human-readable row labels for each girth variable
label = list(shoulder_girth ~ "Shoulder Girth",
chest_girth ~ "Chest Girth",
waist_girth ~ "Waist Girth",
navel_girth ~ "Navel Girth",
hip_girth ~ "Hip Girth",
thigh_girth ~ "Thigh Girth",
bicep_girth_flexed ~ "Flexed Bicep Girth",
forearm_girth ~ "Forearm Girth",
knee_girth ~ "Knee Girth",
calf_girth ~ "Calf Girth",
ankle_girth ~ "Ankle Girth",
wrist_girth ~ "Wrist Girth")) %>%
# equal-variance two-sample t-test for each row (var.equal = TRUE),
# unlike the Welch test used earlier in the file
add_p(test = all_continuous() ~ "t.test",
test.args = all_tests("t.test") ~ list(var.equal = TRUE))
Characteristic | Female, N = 260¹ | Male, N = 247¹ | p-value² |
---|---|---|---|
Shoulder Girth | 100 (6), 86 100 130 | 117 (6), 100 116 135 | <0.001 |
Chest Girth | 86 (6), 73 86 109 | 101 (7), 79 101 119 | <0.001 |
Waist Girth | 70 (8), 58 68 102 | 85 (9), 67 83 113 | <0.001 |
Navel Girth | 84 (10), 64 82 121 | 88 (8), 67 87 112 | <0.001 |
Hip Girth | 96 (7), 79 95 128 | 98 (6), 82 97 119 | <0.001 |
Thigh Girth | 57.2 (4.6), 46.3 56.4 75.7 | 56.5 (4.2), 46.8 56.0 70.0 | 0.078 |
Flexed Bicep Girth | 28.1 (2.7), 22.4 27.8 40.3 | 34.4 (3.0), 25.6 34.4 42.4 | <0.001 |
Forearm Girth | 23.76 (1.68), 19.60 23.60 30.80 | 28.24 (1.78), 22.70 28.40 32.50 | <0.001 |
Knee Girth | 35.26 (2.58), 29.00 35.00 49.00 | 37.20 (2.27), 31.10 37.00 45.70 | <0.001 |
Calf Girth | 35.01 (2.61), 28.40 34.85 45.40 | 37.21 (2.65), 28.90 37.30 47.70 | <0.001 |
Ankle Girth | 21.21 (1.44), 17.40 21.10 26.00 | 23.16 (1.73), 16.40 23.00 29.30 | <0.001 |
Wrist Girth | 15.06 (0.85), 13.00 15.00 18.20 | 17.19 (0.91), 14.60 17.10 19.60 | <0.001 |
1 Mean (SD), Minimum Median Maximum | |||
2 Two Sample t-test |
# Obtain 95% confidence interval around a mean using standard SE formula
# Get mean of new COVID-19 deaths in New York State in 2022
# (the data are state-level; "NYC" elsewhere in the notes is a misnomer —
# the filter below is state == "New York")
ny_covid22 <- covid_raw %>%
# filter to NY
filter(state == "New York") %>%
# order by date so lag() compares consecutive rows chronologically
arrange(date) %>%
# deaths is cumulative, so the row-over-row difference gives daily new deaths;
# the first row becomes NA but is pre-2022 and dropped by the date filter
# (assumes exactly one row per date — TODO confirm)
mutate(new_deaths = deaths - lag(deaths)) %>%
# filter to dates in year 2022 (the string is coerced to Date for comparison)
filter(date >= "2022-01-01")
# mean daily new deaths over the 2022 window
mean_nd <- mean(ny_covid22$new_deaths)
# mean, n, SE-based 95% CI, and a formatted string in one summary row
standard_ci <- ny_covid22 %>%
summarize(mean_nd = mean(new_deaths),
total_n = n(),
se = sd(new_deaths)/sqrt(total_n), # standard error of the mean
lower_ci = mean_nd - 1.96*se, # normal-approximation 95% CI
upper_ci = mean_nd + 1.96*se) %>%
mutate(pretty_ci = str_c(round(mean_nd,2), " (", round(lower_ci,2), ", ", round(upper_ci,2), ")"))
# pull the formatted "mean (lower, upper)" string for in-line reporting
ci_out <- standard_ci %>%
pull(pretty_ci)
The mean number and 95% confidence interval of daily new COVID-19 deaths reported in New York since 01/01/2022 is 47.24 (30, 64.47).
# Obtain 95% confidence interval around a mean using bootstrapping
# helper: mean of the new_deaths column of a data frame
get_mean <- function(df){
  mean(df$new_deaths)
}
# one bootstrap resample: draw rows with replacement, same size as the original
create_bootstrap <- function(df){
  slice_sample(df, prop = 1, replace = TRUE)
}
# combine functions: one bootstrap replicate of the mean
mean_bootstrap <- function(df){
  get_mean(create_bootstrap(df))
}
# get bootstrapped results and put into a tibble (2000 replicates of the mean)
boot_res <- map_dbl(1:2000, function(x) mean_bootstrap(ny_covid22))
res_tibble <- tibble(run = 1:2000, mean = boot_res)
# get bootstrapped confidence interval with quantiles
# (percentile method: 2.5th and 97.5th percentiles of the replicate means)
boot_ci <- res_tibble %>%
summarize(lower_ci = quantile(mean, 0.025),
upper_ci = quantile(mean, 0.975))
# plot the distribution of bootstrapped means
ggplot(res_tibble, aes(x = mean)) +
  geom_histogram(color = "white", fill = "darkgrey") +
  theme_bw() +
  labs(title = "Bootstrapped Distribution")
The bootstrapped 95% CI for number of new deaths reported in New York from 01/01/22 onward is 34.533557, 66.3423937.
Using k-fold cross validation, CART modeling, and logistic regression modeling
# Step 1: Split data into training and test sets
biometrics_pred <- biometrics_log %>%
# factor outcome and gender (caret's class-probability machinery needs a
# factor outcome with syntactic level names, hence "Not_overweight"/"Overweight")
mutate(gender_fac = factor(gender, levels = c('Female', 'Male')),
wt = if_else(overweight == 0, 'Not_overweight', 'Overweight'),
wt_fac = factor(wt, levels = c('Not_overweight', 'Overweight'))) %>%
# remove variables: bmi/weight/height/obese/overweight directly encode the
# outcome (leakage), id is an identifier, and wt/gender are superseded by
# their factor versions — presumably dropping age is also intended; verify
select(-id, -obese, -overweight, -age, -weight, -wt, -height, -bmi, -gender)
# check distribution of outcome overweight in total dataset
biometrics_pred %>%
  count(wt_fac) + # count() is shorthand for group_by + summarize(n = n())
  mutate(proportion = n / sum(n))
## # A tibble: 2 × 3
## wt_fac n proportion
## <fct> <int> <dbl>
## 1 Not_overweight 357 0.704
## 2 Overweight 150 0.296
#split data, 75% training
# initial_split() samples randomly — results change run to run unless a seed is set
split_data <- initial_split(biometrics_pred, prop = 0.75, strata = wt_fac)
# stratify so proportion of outcome is the same between training/test
train_dat <- training(split_data)
test_dat <- testing(split_data)
# Check that the training-set outcome distribution matches the full dataset
train_dat %>%
  count(wt_fac) %>%
  mutate(proportion = n / sum(n))
## # A tibble: 2 × 3
## wt_fac n proportion
## <fct> <int> <dbl>
## 1 Not_overweight 267 0.704
## 2 Overweight 112 0.296
# and the same check for the test set
test_dat %>%
  count(wt_fac) %>%
  mutate(proportion = n / sum(n))
## # A tibble: 2 × 3
## wt_fac n proportion
## <fct> <int> <dbl>
## 1 Not_overweight 90 0.703
## 2 Overweight 38 0.297
The distribution of outcome (overweight) in the training and test data appears equal.
# caret logistic regression evaluated with 10-fold cross-validated ROC/AUC
logistic_mod <- train(
form = wt_fac ~ ., # formula: all remaining columns as predictors
data = train_dat, # training data
trControl = trainControl(method = "cv", # cross-validation
number = 10, # 10-fold
summaryFunction = twoClassSummary, # obtain ROC
classProbs = TRUE, # obtain ROC (needs class probabilities)
savePredictions = "final"), # plot ROC/AUC
metric = "ROC", # use ROC/AUC
method = "glm", # glm function
family = "binomial" # logistic regression
)
The 10-fold cross-validated ROC/AUC of this model on the training dataset is 0.968839.
## tune based on ROC/AUC
# CART model: caret tunes rpart's complexity parameter (cp) via 10-fold CV
class_tree <- train(
form = wt_fac ~ ., # formula
data = train_dat, # training data
method = "rpart", # CART
trControl = trainControl(method = "cv", # cross validation
number = 10, # 10-fold
summaryFunction = twoClassSummary, # obtain ROC
classProbs = TRUE, # obtain ROC
savePredictions = "final"), # save for plotting ROC/AUC
metric = "ROC" # use ROC/AUC
)
# cross-validated model ROC/AUC vs different complexity parameters
# (x axis is rpart's cp tuning parameter; higher ROC is better)
plot(class_tree)
The complexity parameter (cp) value selected to maximize cross-validated ROC/AUC is 0.0491071 (this is the tuning parameter, not the AUC itself).
# plot final model (tree structure for the best cp fit)
rpart.plot(class_tree$finalModel)
# compare cross-validated ROC/AUC of logistic and CART to see which model is best
# resamples() collects the fold-level metrics from both caret fits
compare_mods <- resamples(list("Logistic"=logistic_mod,
"CART" = class_tree))
dotplot(compare_mods, metric = "ROC")
The logistic model performs better on the training data with the AUC=0.97 compared to 0.85.
# create a ROC plot and report the AUC for final logistic model on the test set
log_pred_prob <- predict(logistic_mod, newdata = test_dat, type = "prob")
# type = "prob" gives predicted probabilities (one column per class level)
# merge datasets: attach the probability columns to the test rows
log_test_pred_df <- test_dat %>%
bind_cols(log_pred_prob)
# ROC of the true class against the predicted probability of "Overweight"
log_test_roc <- roc(wt_fac ~ Overweight, data = log_test_pred_df)
# plot ROC values
ggroc(log_test_roc) +
theme_bw()
The AUC for my final logistic model is 0.9588.
This is a high AUC value and suggests the model performs strongly on the test data.