library(tidyverse)
library(psych)
library(sjPlot)
library(ggthemes)
library(ggplot2)
library(MASS)   # loaded after tidyverse, so MASS::select masks dplyr::select
library(purrr)
library(broom)
library(lm.beta)
library(RColorBrewer)
library(plotly)
library(glmnet)
library(glmnetUtils)
source("http://www.labape.com.br/rprimi/R/cria_quartis.R")
In multiple regression with \(X_1 ... X_n\) predicting \(Y\), each \(B_i\) or \(\beta_i\) coefficient represents the unique effect of that variable on \(Y\), controlling for all the other variables in the model. If \(\beta_1\) corresponds to the unique effect of \(X_1\), and \(X_1\) is correlated with \(X_2\), where does the shared effect go?
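A minimal sketch of the question (the names R and d are just for illustration), using the same MASS::mvrnorm() with empirical = TRUE approach as the simulation below: the coefficient of \(X_1\) shrinks once the correlated predictor \(X_2\) enters the model, because the shared part is split between them.
# X1 and X2 each correlate .50 with Y and .70 with each other
R <- matrix(c(1.0, 0.7, 0.5,
              0.7, 1.0, 0.5,
              0.5, 0.5, 1.0),
            nrow = 3, dimnames = list(c("X1", "X2", "Y"), c("X1", "X2", "Y")))
d <- MASS::mvrnorm(400, mu = rep(0, 3), Sigma = R, empirical = TRUE) %>% as.data.frame()
coef(lm(Y ~ X1, data = d))        # simple regression: b1 = .50
coef(lm(Y ~ X1 + X2, data = d))   # multiple regression: b1 = b2 ≈ .29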
This code simulates drawing 3 variables \(Y\), \(X_1\), and \(X_2\) from a multivariate normal distribution with a specified correlation matrix var_cor. It builds a grid of possible correlation parameters and then simulates data with the mvrnorm function.
# Grid of correlation parameters: r_Y1 = cor(X1, Y), r_Y2 = cor(X2, Y), r_12 = cor(X1, X2)
params <- expand.grid(
  r_Y1 = seq(0, .68, by = .05),
  r_Y2 = seq(0, .68, by = .05),
  r_12 = seq(0, .88, by = .05)
)
simulate_3vars <- function(r_Y1, r_Y2, r_12, n = 400) {
  # Correlation matrix among X1, X2 and Y
  var_cor <- matrix(
    c(1.00, r_12, r_Y1,
      r_12, 1.00, r_Y2,
      r_Y1, r_Y2, 1.00),
    byrow = TRUE, nrow = 3, ncol = 3,
    dimnames = list(
      c("X1", "X2", "Y"),
      c("X1", "X2", "Y")
    )
  )
  # try() returns a "try-error" when var_cor is not positive definite,
  # so those parameter combinations can be filtered out later
  df <- try(
    mvrnorm(n, mu = c(0, 0, 0),
            Sigma = var_cor, empirical = TRUE) %>% as.data.frame(),
    silent = TRUE
  )
  return(df)
}
# Simulate one data set per row of the parameter grid; cl flags which
# combinations produced a valid data.frame (vs. a "try-error")
simulated_data <- params %>%
  mutate(
    data = pmap(., simulate_3vars),
    cl   = map_chr(data, class)
  )
simulated_data <- simulated_data %>%
  filter(cl == "data.frame") %>%
  mutate(
    mult_regres  = map(data, ~lm(Y ~ X1 + X2, data = .)),
    # glmnetUtils supplies the formula interface for glmnet
    lasso_regres = map(data, ~glmnet(Y ~ X1 + X2, data = .)),
    beta   = map(mult_regres, ~lm.beta(.)),
    # the simulated variables have unit variance, so raw coefficients equal betas
    beta2  = map(beta, "coefficients"),
    glance = map(mult_regres, glance)
  )
# Turn each named coefficient vector into a one-row data frame so it can be unnested
simulated_data <- simulated_data %>%
  mutate(beta2 = map(beta2, ~ as.data.frame(t(.))))
# dplyr::select is called explicitly because MASS::select masks it
summar <- simulated_data %>% dplyr::select(glance) %>% unnest(cols = glance)
betas  <- simulated_data %>% dplyr::select(beta2)  %>% unnest(cols = beta2)
df <- simulated_data %>% dplyr::select(1:3) %>% bind_cols(betas, summar)
# Columns 5 and 6 hold the coefficients for X1 and X2
names(df)[5:6] <- c("b_x1", "b_x2")
As explained here: https://stats.stackexchange.com/questions/24827/where-is-the-shared-variance-between-all-ivs-in-a-linear-multiple-regression-equ, when \(X_1\) and \(X_2\) have the same correlation with \(Y\), as \(X_1\) and \(X_2\) “come closer and closer to being perfectly correlated, their b-values in the multiple regression come closer and closer to HALF of the b-value in the simple linear regression of either one of them. However, as \(X_1\) and \(X_2\) come closer and closer to being perfectly correlated, the STANDARD ERROR of b1 and b2 moves closer and closer to infinity, so the t-values converge on zero. So, the t-values will converge on zero (i.e., no UNIQUE linear relationship between either \(X_1\) and \(Y\) or \(X_2\) and \(Y\)), but the b-values converge to half the value of the b-values in the simple linear regression … so as the correlation between \(X_1\) and \(X_2\) approaches unity, EACH of the partial slope coefficients approaches contributing equally to the prediction of the \(Y\) value, even though neither independent variable offers any UNIQUE explanation of the dependent variable” (HTH // Phil, 2017).
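With standardized variables this behaviour follows from the normal equations \(\boldsymbol{\beta} = R_{XX}^{-1}\, r_{XY}\): when \(r_{YX_1} = r_{YX_2} = r\), each beta equals \(r/(1 + r_{12})\), which tends to \(r/2\) as \(r_{12} \rightarrow 1\). A small numeric check (the helper name beta_from_cor is purely illustrative):
beta_from_cor <- function(r_Y1, r_Y2, r_12) {
  R_xx <- matrix(c(1, r_12, r_12, 1), nrow = 2)   # predictor correlation matrix
  solve(R_xx, c(r_Y1, r_Y2))                      # standardized betas
}
beta_from_cor(.50, .50, .50)  # 0.333 0.333
beta_from_cor(.50, .50, .90)  # 0.263 0.263
beta_from_cor(.50, .50, .99)  # 0.251 0.251 -> approaching .25, half the simple-regression value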
This is shown in the simulation below.
df %>% filter(r_Y2 == .50 & r_Y1 == .50) %>%
ggplot(aes(y=b_x1, x=b_x2, color = r_12, size=r_12)) +
geom_point(alpha=1/2) +
scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
scale_y_continuous(breaks= seq(0.20, .55, by=.05), limits = c(.20,.55)) +
scale_x_continuous(breaks= seq(0.20, .55, by=.05), limits = c(.20,.55)) +
geom_vline(xintercept = .5, color = "orange") +
geom_hline(yintercept = .5, color = "orange") +
theme_minimal()
Now look how wildly the \(\beta\)'s behave when \(r_{YX_1} > r_{YX_2}\) as \(r_{X_1X_2}\) approaches 1. Here we are seeing Simpson's paradox.
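The same illustrative helper from above shows why: \(\beta_2 = (r_{YX_2} - r_{12}\, r_{YX_1}) / (1 - r_{12}^2)\), so \(\beta_2\) crosses zero and turns negative as soon as \(r_{12} > r_{YX_2} / r_{YX_1}\) (here .50):
beta_from_cor(.50, .25, .40)  #  0.476  0.060
beta_from_cor(.50, .25, .50)  #  0.500  0.000  -> X2 adds nothing unique
beta_from_cor(.50, .25, .80)  #  0.833 -0.417  -> sign flip once r_12 exceeds r_Y2 / r_Y1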
df %>% filter(r_Y1 == .50 & r_Y2 == .25 ) %>%
ggplot(aes(y=b_x1, x=b_x2, color = r_12, size=r_12)) +
geom_point(alpha=1/2) +
scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
geom_vline(xintercept = .25, color = "orange") +
geom_hline(yintercept = .50, color = "orange") +
scale_y_continuous(breaks= round(seq(.3, 1, by=.10), 2), limits = c(.3,1)) +
scale_x_continuous(breaks= round(seq(-.6, .40, by=.10), 2), limits = c(-.6,.40)) +
theme_minimal()
Now a 3D graph.
colors <- brewer.pal(7, "Paired")
df %>% filter(r_Y1 == .50 & r_Y2 == .25 ) %>%
plot_ly(y = ~b_x1, z = ~b_x2, x = ~r_12,
type="scatter3d",
mode="markers", color=~r_12,
colors = colors,
size = 3.5,
opacity = 1/1.5)
colors <- brewer.pal(7, "Paired")
df %>% filter(r_Y1 == .50 & r_Y2 == .50 ) %>%
plot_ly(y = ~b_x1, z = ~b_x2, x = ~r_12,
type="scatter3d",
mode="markers", color=~r_12,
colors = colors,
size = 3.5,
opacity = 1/1.5)
# http://varianceexplained.org/broom-gallery/snippets/broom-glmnet.html
tidy(simulated_data$lasso_regres[[1]])
tidied <- tidy(simulated_data$lasso_regres[[1]]) %>% filter(term != "(Intercept)")
ggplot(tidied, aes(step, estimate, group = term)) + geom_line()
coefficients <- simulated_data$lasso_regres[[1]] %>%
broom::tidy() %>%
mutate(log_lambda = log(lambda)) %>%
filter(term != "(Intercept)")
plot(simulated_data$lasso_regres[[1]], xvar = "lambda", label = TRUE)
ggplot(coefficients, aes(x = log_lambda, y = estimate, col = term)) +
geom_line() + coord_cartesian(xlim = range(coefficients$log_lambda))
tidied_cv <- tidy(simulated_data$lasso_regres[[1]])
glance_cv <- glance(simulated_data$lasso_regres[[1]])
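The `_cv` names in the broom gallery snippet refer to a cross-validated fit; a minimal sketch of that step, assuming glmnetUtils' formula interface for cv.glmnet (not fitted above), would be:
# Cross-validated lasso on the first simulated data set (illustrative only)
cv_fit <- cv.glmnet(Y ~ X1 + X2, data = simulated_data$data[[1]])
tidy(cv_fit)    # CV error estimate at each lambda
glance(cv_fit)  # lambda.min and lambda.1se
plot(cv_fit)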