setwd("~/Dropbox/R Stat")

Análise de variância fatorial

Prática com o senna v1 (two way ANOVA): Há diferenças em auto gestão entre alunos de séries diferentes ? Essa diferença é a mesma entre homes e mulheres? ANOVA 3 X 2 (Escolaridade X Sexo)

# Abrir banco de dados
load("senna.RData")

# Cria escolaridade e sexo como uma variável "factor"
sennav1$esc2 <- factor(sennav1$ESCOLARIDADE)
sennav1$sexo2 <- factor(sennav1$SEXO)

# ANOVA VD: auto gestão VI's: escolaridade e sexo
fit <- aov(F1.Cons ~ esc2*sexo2, data = sennav1)
summary(fit)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## esc2         2 13.239   6.619  12.904 2.18e-05 ***
## sexo2        1  0.066   0.066   0.129   0.7207    
## esc2:sexo2   2  3.290   1.645   3.207   0.0475 *  
## Residuals   60 30.779   0.513                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Comparações post-hoc
TukeyHSD(fit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = F1.Cons ~ esc2 * sexo2, data = sennav1)
## 
## $esc2
##           diff       lwr         upr     p adj
## 7-5 -0.5942266 -1.108546 -0.07990716 0.0197400
## 9-5 -1.1221600 -1.653347 -0.59097316 0.0000118
## 9-7 -0.5279334 -1.042253 -0.01361397 0.0429873
## 
## $sexo2
##           diff        lwr       upr     p adj
## 1-0 0.06220468 -0.2904923 0.4149017 0.7254824
## 
## $`esc2:sexo2`
##                diff         lwr          upr     p adj
## 7:0-5:0 -0.20508351 -1.09407169  0.683904656 0.9836108
## 9:0-5:0 -0.48148148 -1.47540047  0.512437509 0.7112231
## 5:1-5:0  0.66975309 -0.25997300  1.599479170 0.2908978
## 7:1-5:0 -0.22222222 -1.21614121  0.771696769 0.9857579
## 9:1-5:0 -0.93291576 -1.86264184 -0.003189675 0.0487391
## 9:0-7:0 -0.27639797 -1.16538614  0.612590205 0.9411012
## 5:1-7:0  0.87483660  0.05824882  1.691424379 0.0289064
## 7:1-7:0 -0.01713871 -0.90612688  0.871849464 0.9999999
## 9:1-7:0 -0.72783224 -1.54442002  0.088755533 0.1072588
## 5:1-9:0  1.15123457  0.22150848  2.080960651 0.0070750
## 7:1-9:0  0.25925926 -0.73465973  1.253178250 0.9718564
## 9:1-9:0 -0.45143428 -1.38116036  0.478291806 0.7092112
## 7:1-5:1 -0.89197531 -1.82170139  0.037750775 0.0672062
## 9:1-5:1 -1.60266885 -2.46342794 -0.741909750 0.0000128
## 9:1-7:1 -0.71069354 -1.64041962  0.219032547 0.2308620
# Alguns programas em: http://www.personality-project.org/r/r.anova.html
print(model.tables(fit,"means"),digits=3)       
## Tables of means
## Grand mean
##          
## 3.500941 
## 
##  esc2 
##         5     7     9
##      4.07  3.48  2.95
## rep 21.00 24.00 21.00
## 
##  sexo2 
##         0     1
##      3.47  3.53
## rep 33.00 33.00
## 
##  esc2:sexo2 
##      sexo2
## esc2  0     1    
##   5    3.69  4.36
##   rep  9.00 12.00
##   7    3.49  3.47
##   rep 15.00  9.00
##   9    3.21  2.76
##   rep  9.00 12.00
boxplot(F1.Cons ~ esc2*sexo2,data=sennav1 )

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
plotmeans(F1.Cons ~ interaction(esc2, sexo2, sep=" "),
           connect=list(c(1,2,3),c(4,5,6)),
                      data = sennav1)

library(HH)
## Loading required package: lattice
## Loading required package: grid
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
## Loading required package: gridExtra
## 
## Attaching package: 'HH'
## The following object is masked from 'package:gplots':
## 
##     residplot

interaction2wt(F1.Cons ~ esc2*sexo2, data = sennav1)

Prática com o senna v1 (two way ANOVA): Há diferenças em auto gestão entre alunos de séries diferentes? Essa diferença é a mesma entre alunos com desemenho acima ou abaixo da mediana ? ANOVA 3 X 2 ( Escolaridade X Desempenho)

# Cria uma divisão pela mediana 
sennav1$m_notas2 <-cut(sennav1$m_notas, 
        breaks =c(0, median(sennav1$m_notas, na.rm =TRUE), 10))

sennav1$sexo2 <- factor(sennav1$SEXO)

# ANOVA VD: auto gestão VI's: escolaridade e sexo
fit <- aov(F1.Cons ~ esc2*m_notas2, data = sennav1)
summary(fit)
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## esc2           2 13.228   6.614  13.446 1.54e-05 ***
## m_notas2       1  2.413   2.413   4.906   0.0306 *  
## esc2:m_notas2  2  2.599   1.300   2.642   0.0796 .  
## Residuals     59 29.020   0.492                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
# Comparações post-hoc
TukeyHSD(fit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = F1.Cons ~ esc2 * m_notas2, data = sennav1)
## 
## $esc2
##         diff       lwr         upr     p adj
## 7-5 -0.58061 -1.089540 -0.07168007 0.0216407
## 9-5 -1.12216 -1.642528 -0.60179196 0.0000082
## 9-7 -0.54155 -1.050480 -0.03262000 0.0345090
## 
## $m_notas2
##                         diff        lwr       upr     p adj
## (7.33,10]-(0,7.33] 0.3617318 0.01355727 0.7099064 0.0419823
## 
## $`esc2:m_notas2`
##                                diff         lwr        upr     p adj
## 7:(0,7.33]-5:(0,7.33]   -0.01388889 -1.04675554  1.0189778 1.0000000
## 9:(0,7.33]-5:(0,7.33]   -0.37843137 -1.37627536  0.6194126 0.8723959
## 5:(7.33,10]-5:(0,7.33]   1.01111111  0.01326712  2.0089551 0.0452725
## 7:(7.33,10]-5:(0,7.33]   0.31124975 -0.73714960  1.3596491 0.9511430
## 9:(7.33,10]-5:(0,7.33]  -0.45370370 -1.64635539  0.7389480 0.8709519
## 9:(0,7.33]-7:(0,7.33]   -0.36454248 -1.16459755  0.4355126 0.7604845
## 5:(7.33,10]-7:(0,7.33]   1.02500000  0.22494493  1.8250551 0.0048276
## 7:(7.33,10]-7:(0,7.33]   0.32513864 -0.53714710  1.1874244 0.8750733
## 9:(7.33,10]-7:(0,7.33]  -0.43981481 -1.47268147  0.5930518 0.8081073
## 5:(7.33,10]-9:(0,7.33]   1.38954248  0.63524333  2.1438416 0.0000164
## 7:(7.33,10]-9:(0,7.33]   0.68968112 -0.13032851  1.5096908 0.1476458
## 9:(7.33,10]-9:(0,7.33]  -0.07527233 -1.07311632  0.9225717 0.9999217
## 7:(7.33,10]-5:(7.33,10] -0.69986136 -1.51987099  0.1201483 0.1366823
## 9:(7.33,10]-5:(7.33,10] -1.46481481 -2.46265880 -0.4669708 0.0008166
## 9:(7.33,10]-7:(7.33,10] -0.76495346 -1.81335281  0.2834459 0.2771982
# Figuras
print(model.tables(fit,"means"),digits=3)       
## Tables of means
## Grand mean
##          
## 3.506083 
## 
##  esc2 
##         5     7     9
##      4.07  3.49  2.95
## rep 21.00 23.00 21.00
## 
##  m_notas2 
##     (0,7.33] (7.33,10]
##         3.33      3.69
## rep    33.00     32.00
## 
##  esc2:m_notas2 
##      m_notas2
## esc2  (0,7.33] (7.33,10]
##   5    3.35     4.36    
##   rep  6.00    15.00    
##   7    3.34     3.66    
##   rep 12.00    11.00    
##   9    2.97     2.90    
##   rep 15.00     6.00
boxplot(F1.Cons ~ esc2*m_notas2,data=sennav1)

interaction2wt(F1.Cons ~ esc2*m_notas2, data = sennav1)

Análise de regressão múltipla? Qual contribuição única para predizer notas das variáveis socioemocionais ?

# Correlação entre notas e habilidades socioemocionais 

library(sjPlot)
sjt.corr(sennav1[ , c("s_faltas", "m_notas", "F1.Cons", "F2.Extr", "F3.EmSt", "F4.Agre", "F5.Opns", "F6.NVLoc")] , triangle = "lower")

# Regressão múltipla VD=Notas e VI's variáveis do Senna
fit2 <- lm( m_notas~F1.Cons+F2.Extr+F3.EmSt+F4.Agre+F5.Opns+F6.NVLoc, data=sennav1)
summary(fit2) 
## 
## Call:
## lm(formula = m_notas ~ F1.Cons + F2.Extr + F3.EmSt + F4.Agre + 
##     F5.Opns + F6.NVLoc, data = sennav1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.40730 -0.53414  0.07799  0.61265  2.09773 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.72321    1.11784   4.225 8.54e-05 ***
## F1.Cons      0.85598    0.24183   3.540 0.000797 ***
## F2.Extr     -0.07189    0.25169  -0.286 0.776194    
## F3.EmSt     -0.44010    0.23712  -1.856 0.068537 .  
## F4.Agre      0.85238    0.26088   3.267 0.001826 ** 
## F5.Opns     -0.40424    0.25114  -1.610 0.112914    
## F6.NVLoc    -0.16230    0.25627  -0.633 0.529008    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9748 on 58 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3861, Adjusted R-squared:  0.3226 
## F-statistic:  6.08 on 6 and 58 DF,  p-value: 5.513e-05
# Regressão múltipla VD=Frequência e VI's variáveis do Senna
fit3 <- lm( s_faltas~F1.Cons+F2.Extr+F3.EmSt+F4.Agre+F5.Opns+F6.NVLoc, data=sennav1)
summary(fit3) 
## 
## Call:
## lm(formula = s_faltas ~ F1.Cons + F2.Extr + F3.EmSt + F4.Agre + 
##     F5.Opns + F6.NVLoc, data = sennav1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.074 -12.285  -2.660   9.783  44.342 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   72.786     19.092   3.812 0.000331 ***
## F1.Cons      -13.230      4.109  -3.220 0.002085 ** 
## F2.Extr       -3.470      4.305  -0.806 0.423382    
## F3.EmSt        4.461      4.021   1.109 0.271822    
## F4.Agre       -6.334      4.458  -1.421 0.160688    
## F5.Opns        6.987      4.281   1.632 0.108001    
## F6.NVLoc      -5.234      4.374  -1.196 0.236288    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.68 on 59 degrees of freedom
## Multiple R-squared:  0.2506, Adjusted R-squared:  0.1744 
## F-statistic: 3.289 on 6 and 59 DF,  p-value: 0.007376
sjp.lm(fit2, type = "std")

sjp.lm(fit3, type = "std")

VD: notas

    m_notas
    B CI std. Beta CI p
(Intercept)   4.72 2.49 – 6.96     <.001
F1.Cons   0.86 0.37 – 1.34 0.62 0.28 – 0.96 .001
F2.Extr   -0.07 -0.58 – 0.43 -0.03 -0.26 – 0.20 .776
F3.EmSt   -0.44 -0.91 – 0.03 -0.30 -0.61 – 0.02 .069
F4.Agre   0.85 0.33 – 1.37 0.41 0.17 – 0.66 .002
F5.Opns   -0.40 -0.91 – 0.10 -0.22 -0.48 – 0.05 .113
F6.NVLoc   -0.16 -0.68 – 0.35 -0.08 -0.33 – 0.17 .529
Observations   65
R2 / adj. R2   .386 / .323

VD: faltas

    s_faltas
    B CI std. Beta CI p
(Intercept)   72.79 34.58 – 110.99     <.001
F1.Cons   -13.23 -21.45 – -5.01 -0.62 -0.99 – -0.24 .002
F2.Extr   -3.47 -12.08 – 5.14 -0.10 -0.35 – 0.15 .423
F3.EmSt   4.46 -3.59 – 12.51 0.19 -0.15 – 0.54 .272
F4.Agre   -6.33 -15.25 – 2.59 -0.20 -0.47 – 0.07 .161
F5.Opns   6.99 -1.58 – 15.55 0.24 -0.05 – 0.53 .108
F6.NVLoc   -5.23 -13.99 – 3.52 -0.17 -0.44 – 0.11 .236
Observations   66
R2 / adj. R2   .251 / .174

Estudo de padrões desenvolvimentais e auto gestão: revisitando a relação entre nota, escolaridade e auto gestão usando regressão múltipla

  F1.Cons m_notas IDADE
F1.Cons      
m_notas 0.495***    
IDADE -0.504*** -0.378**  
Computed correlation used spearman-method with listwise-deletion.
  F4.Agre m_notas IDADE
F4.Agre      
m_notas 0.424***    
IDADE -0.277* -0.378**  
Computed correlation used spearman-method with listwise-deletion.
# Correlação entre notas e habilidades socioemocionais 

library(sjPlot)
sjt.corr(sennav1[ , c("F1.Cons", "m_notas", "IDADE")], triangle="lower")

sjt.corr(sennav1[ , c("F4.Agre", "m_notas", "IDADE")], triangle="lower")

# Cria escolaridade como uma variável "factor"


fit4 <- lm(F1.Cons~IDADE*m_notas2, data=sennav1)
summary(fit4) # show results
## 
## Call:
## lm(formula = F1.Cons ~ IDADE * m_notas2, data = sennav1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.58980 -0.37807  0.02131  0.53516  1.68798 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              4.44198    1.09643   4.051 0.000146 ***
## IDADE                   -0.09980    0.08583  -1.163 0.249448    
## m_notas2(7.33,10]        4.99861    1.64481   3.039 0.003492 ** 
## IDADE:m_notas2(7.33,10] -0.37367    0.13429  -2.783 0.007167 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6918 on 61 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3822, Adjusted R-squared:  0.3518 
## F-statistic: 12.58 on 3 and 61 DF,  p-value: 1.668e-06
sjp.lm(fit4, type = "std")

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:latticeExtra':
## 
##     layer
ggplot(data=sennav1, aes(x=IDADE, y=F1.Cons, color = m_notas2)) + geom_point() + geom_smooth(method="lm", se=FALSE)       

    F1.Cons
    B CI std. Beta CI p
(Intercept)   4.44 2.25 – 6.63     <.001
IDADE   -0.10 -0.27 – 0.07 -0.16 -0.43 – 0.11 .249
m_notas2(7.33,10]   5.00 1.71 – 8.29 2.93 1.04 – 4.82 .003
IDADE:m_notas2(7.33,10]   -0.37 -0.64 – -0.11 -2.61 -4.45 – -0.77 .007
Observations   65
R2 / adj. R2   .382 / .352

Estudo dos coeficientes não padronizados e padronizados na regressão múltipla ?

# Separa base so com as variáveis de interesse 
dt <- sennav1[ , c("m_notas", "F1.Cons", "F2.Extr", 
                   "F3.EmSt", "F4.Agre", "F5.Opns", "F6.NVLoc")]

# Roda regressão múltipla
fit2 <- lm( m_notas~F1.Cons+F2.Extr+F3.EmSt+F4.Agre+F5.Opns+F6.NVLoc, data=dt)
# sjt.lm(fit2, showStdBeta = TRUE) 

# Estatísticas descritivas 
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:HH':
## 
##     logit
describe(dt)
##          vars  n mean   sd median trimmed  mad  min  max range  skew
## m_notas     1 65 7.33 1.18   7.33    7.33 1.27 4.67 9.88  5.21  0.00
## F1.Cons     2 66 3.50 0.85   3.36    3.51 0.86 1.22 5.00  3.78 -0.17
## F2.Extr     3 66 3.41 0.54   3.37    3.40 0.53 2.14 5.00  2.86  0.26
## F3.EmSt     4 66 3.30 0.80   3.16    3.32 1.07 1.12 4.90  3.78 -0.16
## F4.Agre     5 66 3.53 0.57   3.50    3.51 0.54 2.25 4.83  2.58  0.23
## F5.Opns     6 66 3.26 0.64   3.31    3.30 0.64 1.62 4.50  2.88 -0.49
## F6.NVLoc    7 66 2.39 0.58   2.27    2.34 0.44 1.50 4.38  2.88  0.93
##          kurtosis   se
## m_notas     -0.66 0.15
## F1.Cons     -0.27 0.11
## F2.Extr      0.07 0.07
## F3.EmSt     -0.63 0.10
## F4.Agre     -0.35 0.07
## F5.Opns      0.18 0.08
## F6.NVLoc     0.71 0.07
# Transforma variáveis em z e mostra estatísticas descritivas
dtz <- as.data.frame(scale(dt))
describe(dtz)
##          vars  n mean sd median trimmed  mad   min  max range  skew
## m_notas     1 65    0  1   0.01    0.00 1.07 -2.24 2.15  4.40  0.00
## F1.Cons     2 66    0  1  -0.16    0.02 1.01 -2.67 1.76  4.43 -0.17
## F2.Extr     3 66    0  1  -0.07   -0.02 0.97 -2.33 2.92  5.25  0.26
## F3.EmSt     4 66    0  1  -0.18    0.02 1.34 -2.73 2.01  4.74 -0.16
## F4.Agre     5 66    0  1  -0.05   -0.03 0.95 -2.24 2.28  4.52  0.23
## F5.Opns     6 66    0  1   0.07    0.05 1.01 -2.58 1.95  4.53 -0.49
## F6.NVLoc    7 66    0  1  -0.19   -0.08 0.77 -1.53 3.42  4.95  0.93
##          kurtosis   se
## m_notas     -0.66 0.12
## F1.Cons     -0.27 0.12
## F2.Extr      0.07 0.12
## F3.EmSt     -0.63 0.12
## F4.Agre     -0.35 0.12
## F5.Opns      0.18 0.12
## F6.NVLoc     0.71 0.12
fit3 <- lm( m_notas~F1.Cons+F2.Extr+F3.EmSt+F4.Agre+F5.Opns+F6.NVLoc, data=dtz)
sjt.lm(fit3, show.std = TRUE)

# install.packages("ggExtra")
library(ggplot2)
library(ggExtra)

# 
p <- ggplot(data=dt, aes(x=F1.Cons, y=m_notas)) + 
        geom_point() + geom_smooth(method="lm") +
        labs(title="SENNA v1.0", x="Auto Gestão", y="Média das Notas") +
        scale_y_continuous(breaks=seq(1, 10, 1)) +
        scale_x_continuous(breaks=seq(1, 5, 0.5)) +
        geom_hline(yintercept=c(7.33, 7.33+1.18,  7.33-1.18), 
                   colour="green", linetype="longdash") +
        geom_vline(xintercept=c(3.50, 3.50+0.85,  3.50-0.85), 
                   colour="red", linetype="longdash")


ggMarginal(p, type = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p2 <- ggplot(data=dtz, aes(x=F1.Cons, y=m_notas)) + 
        geom_point() + geom_smooth(method="lm") +
        labs(title="SENNA v1.0", x="Auto Gestão", y="Média das Notas") +
        scale_y_continuous(breaks=seq(-3, 3, 0.5)) +
        scale_x_continuous(breaks=seq(-3, 3, 0.5)) +
        geom_hline(yintercept=c(0, 1, -1), 
                   colour="green", linetype="longdash") +
        geom_vline(xintercept=c(0, 1,  -1), 
                   colour="red", linetype="longdash")


ggMarginal(p2, type = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Visualização da regressão múltipla

# http://www.statmethods.net/graphs/scatterplot.html
# 3D Scatterplot with Coloring and Vertical Lines
# and Regression Plane 

# install.packages("scatterplot3d")
library(scatterplot3d) 
attach(dtz)
fit <- lm( m_notas~F1.Cons+F4.Agre) 

s3d <-scatterplot3d(F1.Cons, F4.Agre, m_notas, pch=16, 
                    highlight.3d=TRUE, type="h")

s3d$plane3d(fit)

Exercício 4