## BEST SUBSET REGRESSION ##
# Select the best subset of features according to a chosen criterion. For the aim
# of democratizing jump-height estimation through smartphone-IMU measures, the
# criterion was the lowest cross-validation error: the best model is the one
# minimizing the cross-validation error.
# ---------------------------------------------------------------------------------
# Author: Guido Mascia, MSc, PhD student at University of Rome "Foro Italico"
# Email: mascia.guido@gmail.com
# First Commit: 09.03.2022
# Last Modified: 09.03.2022
# ---------------------------------------------------------------------------------
## -- References -- ##
# [1] https://rdocumentation.org/packages/leaps/versions/3.1/topics/regsubsets
# [2] https://jslsoc.sitehost.iu.edu/files_research/testing_tests/hccm/00TAS.pdf
# ---------------------------------------------------------------------------------
# Loading required libraries
library(tidyverse)
library(caret)
library(leaps)
library(corrplot)
library(readr)
library(car)
library(tibble)
library(QuantPsyc)
library(jtools)
library(huxtable)
library(officer)
library(ggplot2)
library(broom.mixed)
## -- Initialize ad-hoc functions -- ##
# Model formula initialization
# It stores the regression formula so that it can be easily printed/retrieved
get_model_formula <- function(id, object, outcome){
  # Selected predictors for model `id` (drop the intercept column)
  models <- summary(object)$which[id, -1]
  # Get outcome variable
  #form <- as.formula(object$call[[2]])
  #outcome <- all.vars(form)[1]
  # Get model predictors
  predictors <- names(which(models == TRUE))
  predictors <- paste(predictors, collapse = "+")
  # Build model formula
  as.formula(paste0(outcome, "~", predictors))
}
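# Quick illustration (hypothetical, not run here since `models` is only fitted
# further below): get_model_formula(3, models, "y") would return a formula such
# as y ~ h + A + b, i.e., the best 3-predictor subset found by regsubsets().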
# Cross-validation error function
# Compute the 10-fold cross-validation RMSE for a given model formula
get_cv_error <- function(model.formula, data){
  set.seed(1)
  train.control <- trainControl(method = "cv", number = 10)
  cv <- train(model.formula, data = data, method = "lm",
              trControl = train.control)
  cv$results$RMSE
}
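# Illustrative use (not run here, since `models` is only fitted further below):
# the 10-fold CV RMSE of a single candidate formula could be obtained with
#   get_cv_error(get_model_formula(3, models, "y"), data = df)
# The seed is fixed inside the function, so repeated calls use identical folds.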
## -- MAIN CODE -- ##
# Load the dataset
# Note that this is just an example: the dataset cannot be shared, i.e.,
# you will not find the "db.csv" file here.
features <- read_csv("~/db.csv")
# Group the features. This line can be modified to exclude some variables
# from further computations.
vars <- c("y","h","A","b","C","D","e","F","G","H","i","J","k","l","M","n","O","p",
"q","r","s","t","u","W","f3","f2","f1", "Bi", "Tr", "Sc", "Ic")
# Create the database to be analyzed.
df <- features[vars]
# Run Best-Subsets Regression [1]
# y~.   : dependent variable (y) regressed on all remaining columns
# data  : dataset containing the dependent and independent variables
# nvmax : maximum size of subsets to examine
# nbest : number of subsets of each size to store
models <- regsubsets(y~., data = df, nvmax = length(vars), nbest = 1)
summary(models)
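# Side check (a sketch; model selection below still relies on the CV error):
# the regsubsets summary also stores adjusted R^2 and BIC, which can be
# compared against the CV-based choice.
res.sum <- summary(models)
data.frame(max.adjr2 = which.max(res.sum$adjr2),  # subset size maximizing adjusted R^2
           min.bic   = which.min(res.sum$bic))    # subset size minimizing BIC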
# Compute Cross-Validation errors
nmodels <- length(vars) - 1   # one candidate model per subset size (y excluded)
model.ids <- 1 : nmodels
cv.errors <- map(model.ids, get_model_formula, models, "y") %>%
  map(get_cv_error, data = df) %>%
  unlist()
round(cv.errors,3)
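# Visual check (a sketch using the already-loaded ggplot2): plot the CV error
# against the subset size to see how sharp the minimum is.
cv.df <- data.frame(n.predictors = model.ids, cv.rmse = cv.errors)
ggplot(cv.df, aes(x = n.predictors, y = cv.rmse)) +
  geom_line() +
  geom_point() +
  labs(x = "Number of predictors", y = "10-fold CV RMSE")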
# Select the model that minimizes the CV error
best_model_idx <- which.min(cv.errors)
round(coef(models, best_model_idx),3)
best.formula <- map(best_model_idx, get_model_formula, models, "y")
best_model <- lm(formula = best.formula[[1]], data = df)
summary(best_model)
# Standardized Beta Coefficients
coef_lmbeta <- lm.beta(best_model)
## -- Store Report professionally -- ##
# robust : heteroscedasticity-robust standard errors (HC3 estimator) [2]
# scale : scale coefficients
# transform.response : get standardized beta coefficients
# confint : confidence intervals
# digits : number of digits
# ci.width : confidence interval width (.95 = 95%)
# vifs : compute Variance Inflation Factors
summ(best_model, robust = "HC3", scale = TRUE, n.sd = 2,
     confint = TRUE, digits = 3, ci.width = .95, vifs = TRUE)
# Reduced model (predictors retained after the VIF check above)
model_vif <- lm(y ~ h + A + b + D + e + O + t + f3 + f1 + Tr, data = df)
summ(model_vif, robust = "HC3", scale = TRUE, n.sd = 2,
     confint = TRUE, digits = 3, ci.width = .95, vifs = TRUE)
betas <- lm.beta(model_vif)
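# A possible way to save the table above to a Word file, using
# jtools::export_summs (which builds on the loaded huxtable/officer packages).
# The output path is a placeholder, so the call is left commented out:
# export_summs(model_vif, to.file = "docx",
#              file.name = "~/model_vif_report.docx")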
## -- Check for possible Heteroskedasticity -- ##
# I am going to use the Non-Constant Variance (NCV) test, assuming:
# · Null Hypothesis (H0): homoscedasticity is present (the residuals are
#   distributed with equal variance) <--> p >= 0.05
# · Alternative Hypothesis (HA): heteroscedasticity is present (the residuals
#   are not distributed with equal variance) <--> p < 0.05
car::ncvTest(model_vif)
## -- Check if residuals are normally distributed -- ##
# I am going to use the Shapiro-Wilk test, where:
# · Null Hypothesis (H0): residuals are normally distributed <--> p >= 0.05
# · Alternative Hypothesis (HA): residuals are not normally distributed
#   <--> p < 0.05
# However, I have to "studentize" the residuals prior to the S-W test
sresid <- MASS::studres(model_vif)
shapiro.test(sresid)
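# Compact view of the two diagnostic p-values (a sketch, assuming ncvTest()
# stores its p-value in $p and shapiro.test() in $p.value):
diag.p <- c(ncv.p = car::ncvTest(model_vif)$p,
            shapiro.p = shapiro.test(sresid)$p.value)
round(diag.p, 4)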
# Finally, store the data into a CSV file for further comparison
fitted_vals <- model_vif$fitted.values
vect <- matrix(ncol = 3, nrow = length(fitted_vals))
vect[,1] <- df$y           # measured outcome
vect[,2] <- df$h           # feature h
vect[,3] <- fitted_vals    # fitted values from model_vif
write.csv(x = vect, file = "~/res.csv")
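# Agreement check (a sketch using the already-loaded ggplot2): measured vs.
# model-estimated outcome, with the identity line as reference.
agreement.df <- data.frame(measured = df$y, estimated = fitted_vals)
ggplot(agreement.df, aes(x = measured, y = estimated)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(x = "Measured y", y = "Estimated y")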