Abdulrazzag M. Falah

Data Scientist & Education Specialist.




Exercise 3.7





Loading required packages

list.of.packages <- c("foreign", "kableExtra", "ggplot2", "data.table","lme4","knitr", "rockchalk","multcomp")

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only=TRUE)
## Loading required package: foreign
## Loading required package: kableExtra
## Loading required package: ggplot2
## Loading required package: data.table
## Loading required package: lme4
## Loading required package: Matrix
## Loading required package: knitr
## Loading required package: rockchalk
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:rockchalk':
## 
##     mvrnorm
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE
## 
## [[5]]
## [1] TRUE
## 
## [[6]]
## [1] TRUE
## 
## [[7]]
## [1] TRUE
## 
## [[8]]
## [1] TRUE

Reading the Data

3.7. High-school-and-beyond data Solutions Raudenbush and Bryk (2002) and Raudenbush et al. (2004) analyzed data from the High School and Beyond Survey. The variables in the dataset hsb.dta that we will use here are:

hsb <- read.dta("http://www.stata-press.com/data/mlmus3/hsb.dta")
head(hsb)
##   minority female    ses mathach size sector pracad disclim himinty
## 1        0      1 -1.528   5.876  842      0   0.35   1.597       0
## 2        0      1 -0.588  19.708  842      0   0.35   1.597       0
## 3        0      0 -0.528  20.349  842      0   0.35   1.597       0
## 4        0      0 -0.668   8.781  842      0   0.35   1.597       0
## 5        0      0 -0.158  17.898  842      0   0.35   1.597       0
## 6        0      0  0.022   4.583  842      0   0.35   1.597       0
##   schoolid     mean       sd    sdalt     junk   sdalt2 num       se
## 1     1224 9.715446 7.592785 6.256328 45.71077 48.39363  47 1.107522
## 2     1224 9.715446 7.592785 6.256328 49.99941 48.39363  47 1.107522
## 3     1224 9.715446 7.592785 6.256328 59.47536 48.39363  47 1.107522
## 4     1224 9.715446 7.592785 6.256328 14.86853 48.39363  47 1.107522
## 5     1224 9.715446 7.592785 6.256328 27.67840 48.39363  47 1.107522
## 6     1224 9.715446 7.592785 6.256328 64.86649 48.39363  47 1.107522
##       sealt   sealt2       t2    t2alt pickone     mmses     mnses
## 1 0.9125792 1.014718 6.958498 8.289523       1 -0.434383 -0.434383
## 2 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
## 3 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
## 4 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
## 5 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
## 6 0.9125792 1.014718 6.958498 8.289523       0 -0.434383 -0.434383
##         xb     resid
## 1 10.13759 -4.261589
## 2 10.13759  9.570412
## 3 10.13759 10.211412
## 4 10.13759 -1.356588
## 5 10.13759  7.760412
## 6 10.13759 -5.554588
summarize(hsb)
## $numerics
##            disclim    female   himinty      junk   mathach      mean
## 0%         -2.4160    0.0000    0.0000    0.0000   -2.8320    4.2398
## 25%        -0.8170    0.0000    0.0000    7.0704    7.2750   10.8837
## 50%        -0.2310    1.0000    0.0000   30.6254   13.1310   13.1601
## 75%         0.4600    1.0000    1.0000   75.1511   18.3170   14.6967
## 100%        2.7560    1.0000    1.0000  239.2892   24.9930   19.7191
## mean       -0.1319    0.5282    0.2800   47.3160   12.7479   12.7479
## sd          0.9440    0.4992    0.4490   48.8976    6.8782    3.0058
## var         0.8911    0.2492    0.2016 2390.9763   47.3103    9.0349
## skewness    0.2394   -0.1129    0.9796    1.2998   -0.1805   -0.2712
## kurtosis   -0.1588   -1.9875   -1.0405    1.4620   -0.9216   -0.0507
## NA's        0.0000    0.0000    0.0000    0.0000    0.0000    0.0000
## N        7185.0000 7185.0000 7185.0000 7185.0000 7185.0000 7185.0000
##           minority     mmses     mnses       num   pickone    pracad
## 0%          0.0000   -1.1939   -1.1939   14.0000    0.0000    0.0000
## 25%         0.0000   -0.3230   -0.3230   41.0000    0.0000    0.3200
## 50%         0.0000    0.0320    0.0320   51.0000    0.0000    0.5300
## 75%         1.0000    0.3269    0.3269   57.0000    0.0000    0.7000
## 100%        1.0000    0.8250    0.8250   67.0000    1.0000    1.0000
## mean        0.2747    0.0001    0.0001   48.0163    0.0223    0.5345
## sd          0.4464    0.4135    0.4135   10.8222    0.1476    0.2512
## var         0.1993    0.1710    0.1710  117.1196    0.0218    0.0631
## skewness    1.0091   -0.2685   -0.2685   -0.5793    6.4739    0.1596
## kurtosis   -0.9819   -0.4789   -0.4789   -0.3704   39.9171   -0.8859
## NA's        0.0000    0.0000    0.0000    0.0000    0.0000    0.0000
## N        7185.0000 7185.0000 7185.0000 7185.0000 7185.0000 7185.0000
##              resid     schoolid        sd     sdalt    sdalt2        se
## 0%        -19.4889    1224.0000    3.5410    6.2563   48.3936    0.5059
## 25%        -4.7643    3020.0000    5.5903    6.2563   48.3936    0.7798
## 50%         0.2358    5192.0000    6.2985    6.2563   48.3936    0.8939
## 75%         5.1619    7342.0000    6.7990    6.2563   48.3936    1.0323
## 100%       16.4446    9586.0000    8.4811    6.2563   48.3936    1.8237
## mean        0.0624    5277.8978    6.1975    6.2563   48.3936    0.9190
## sd          6.4595    2499.5778    0.8637    0.0000    0.0000    0.2017
## var        41.7246 6247889.1555    0.7460    0.0000    0.0000    0.0407
## skewness   -0.1360       0.1074   -0.2358       NaN       NaN    1.1132
## kurtosis   -0.6856      -1.2546    0.2158       NaN       NaN    2.3226
## NA's        0.0000       0.0000    0.0000    0.0000    0.0000    0.0000
## N        7185.0000    7185.0000 7185.0000 7185.0000 7185.0000 7185.0000
##              sealt    sealt2    sector       ses        size        t2
## 0%          0.7643    0.8499    0.0000   -3.7580    100.0000    0.0007
## 25%         0.8287    0.9214    0.0000   -0.5380    565.0000    1.0800
## 50%         0.8761    0.9741    0.0000    0.0020   1016.0000    5.7493
## 75%         0.9771    1.0864    1.0000    0.6020   1436.0000   15.7333
## 100%        1.6721    1.8592    1.0000    2.6920   2713.0000  195.8106
## mean        0.9246    1.0281    0.4931    0.0001   1056.8618   14.6558
## sd          0.1292    0.1437    0.5000    0.7794    604.1725   26.4160
## var         0.0167    0.0206    0.2500    0.6074 365024.4089  697.8054
## skewness    1.5869    1.5869    0.0276   -0.2281      0.5715    3.6684
## kurtosis    3.2496    3.2496   -1.9995   -0.3804     -0.3649   16.8883
## NA's        0.0000    0.0000    0.0000    0.0000      0.0000    0.0000
## N        7185.0000 7185.0000 7185.0000 7185.0000   7185.0000 7185.0000
##              t2alt        xb
## 0%          0.0007    5.6839
## 25%         0.9578   10.7907
## 50%         4.3592   12.8722
## 75%        11.4854   14.6015
## 100%       52.8246   17.5219
## mean        8.5376   12.6855
## sd         11.0627    2.4248
## var       122.3842    5.8798
## skewness    2.0632   -0.2685
## kurtosis    4.2433   -0.4789
## NA's        0.0000    0.0000
## N        7185.0000 7185.0000
## 
## $factors
## NULL
  1. Use xtreg to fit a model for mathach with a fixed effect for SES and a random intercept for school.
model1 <- lmer(mathach ~ ses+(1|schoolid),data = hsb, REML = FALSE)
summary(model1)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: mathach ~ ses + (1 | schoolid)
##    Data: hsb
## 
##      AIC      BIC   logLik deviance df.resid 
##  46649.0  46676.5 -23320.5  46641.0     7181 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1263 -0.7277  0.0220  0.7578  2.9186 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  schoolid (Intercept)  4.729   2.175   
##  Residual             37.030   6.085   
## Number of obs: 7185, groups:  schoolid, 160
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  12.6576     0.1873   67.57
## ses           2.3915     0.1057   22.63
## 
## Correlation of Fixed Effects:
##     (Intr)
## ses 0.003
tmp <- as.data.frame(confint(glht(model1))$confint)
tmp$Comparison <- rownames(tmp)
ggplot(tmp, aes(x = Comparison, y = Estimate, ymin = lwr, ymax = upr)) +
  geom_errorbar() + geom_point()

  1. Use xtsum to explore the between-school and within-school variability of SES.
Overall <- round(c(mean(hsb$ses),sd(hsb$ses),min(hsb$ses),max(hsb$ses)),7)
#Between
between.v<- round(aggregate(ses~schoolid, data=hsb,FUN = mean),7)
btwn <- c("",round(c(sd(between.v$ses),min(between.v$ses),max(between.v$ses)),7))
#Within
setDT(hsb)[, mn_ses := round(mean(ses),digits = 7), by = schoolid]
hsb$dev_ses <- hsb$ses - hsb$mn_ses
within.v <- c("",round(c(sd(hsb$dev_ses),min(hsb$dev_ses),max(hsb$dev_ses)),7))
dat<-as.data.frame(rbind(Overall, btwn, within.v))
rownames(dat)<- c("SES\t Overall","Between","Within")
colnames(dat) <- c("Mean","Std. Dev.", "Min.","Max")
kable(dat, format = "html", booktabs = F )%>%
  kable_styling(c("striped", "bordered", "hover"))%>%
  add_indent(c(2,3))%>%
  footnote(general=paste("N=",nrow(hsb),", Schools=",length(unique(hsb$schoolid)),", T-bar=",nrow(hsb)/length(unique(hsb$schoolid))),general_title="Observations")
Mean Std. Dev. Min. Max
SES Overall 0.0001434 0.7793552 -3.7579999 2.6919999
Between 0.4139706 -1.193946 0.8249825
Within 0.660588 -3.6507406 2.8560783
Observations
N= 7185 , Schools= 160 , T-bar= 44.90625
  1. Produce a variable, mn_ses, equal to the schools’ mean SES and another variable, dev_ses, equal to the difference between the students’ SES and the mean SES for their school.
#already done as part of Q2
  1. The model in step 1 assumes that SES has the same effect within and between schools. Check this by using the covariates mn_ses and dev-ses instead of ses and comparing the coefficients using lincom.
model3 <- lmer(mathach ~ mn_ses+dev_ses+(1|schoolid), data = hsb, REML = FALSE)
summary(model3)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: mathach ~ mn_ses + dev_ses + (1 | schoolid)
##    Data: hsb
## 
##      AIC      BIC   logLik deviance df.resid 
##  46573.8  46608.2 -23281.9  46563.8     7180 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1675 -0.7258  0.0176  0.7554  2.9446 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  schoolid (Intercept)  2.647   1.627   
##  Residual             37.014   6.084   
## Number of obs: 7185, groups:  schoolid, 160
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  12.6836     0.1484   85.46
## mn_ses        5.8656     0.3594   16.32
## dev_ses       2.1912     0.1087   20.17
## 
## Correlation of Fixed Effects:
##         (Intr) mn_ses
## mn_ses  0.010        
## dev_ses 0.000  0.000
summary(glht(model3, linfct = c("mn_ses - dev_ses = 0")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lmer(formula = mathach ~ mn_ses + dev_ses + (1 | schoolid), data = hsb, 
##     REML = FALSE)
## 
## Linear Hypotheses:
##                       Estimate Std. Error z value Pr(>|z|)    
## mn_ses - dev_ses == 0   3.6744     0.3754   9.787   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
confint(glht(model3, linfct = c("mn_ses - dev_ses = 0")))
## 
##   Simultaneous Confidence Intervals
## 
## Fit: lmer(formula = mathach ~ mn_ses + dev_ses + (1 | schoolid), data = hsb, 
##     REML = FALSE)
## 
## Quantile = 1.96
## 95% family-wise confidence level
##  
## 
## Linear Hypotheses:
##                       Estimate lwr    upr   
## mn_ses - dev_ses == 0 3.6744   2.9386 4.4103
tmp <- as.data.frame(confint(glht(model3))$confint)
tmp$Comparison <- rownames(tmp)
ggplot(tmp, aes(x = Comparison, y = Estimate, ymin = lwr, ymax = upr)) +
  geom_errorbar() + geom_point()

  1. Interpret the coefficients of mn_ses and dev_ses.
    • A one-unit increase in mn_ses would equal to a 5.86 increase in math achievement.
    • Two students in the same school will differ by 1 ses point will differ on average by 2.19 math achievement points.
  2. Returning to the model with ses as the only covariate, perform a Hausman specification test and comment on the result.
phtest_lmer <- function (lmerMod, lmMod, ...)  { 
    coef.wi <- coef(lmMod)
    coef.re <- fixef(lmerMod)
    vcov.wi <- vcov(lmMod)
    vcov.re <- vcov(lmerMod)
    names.wi <- names(coef.wi)
    names.re <- names(coef.re)
    coef.h <- names.re[names.re %in% names.wi]
    dbeta <- coef.wi[coef.h] - coef.re[coef.h]
    df <- length(dbeta)
    dvcov <- vcov.re[coef.h, coef.h] - vcov.wi[coef.h, coef.h]
    stat <- abs(t(dbeta) %*% as.matrix(solve(dvcov)) %*% dbeta)
    pval <- pchisq(stat, df = df, lower.tail = FALSE)
    names(stat) <- "chisq"
    parameter <- df
    names(parameter) <- "df"
    alternative <- "The model is incorrectly specified."
    res <- list(statistic = stat, p.value = pval, parameter = parameter, 
        method = "Hausman Test",  alternative = alternative,
                data.name=deparse(getCall(lmerMod)$data))
    class(res) <- "htest"
    return(res)
}

model0 <- glm(mathach ~ ses +  schoolid,
                   data = hsb)
phtest_lmer(model1,model0)
## 
##  Hausman Test
## 
## data:  hsb
## chisq = 406.47, df = 2, p-value < 2.2e-16
## alternative hypothesis: The model is incorrectly specified.