Loading In Data

CASchool <- read_dta("caschool.dta") # this is in the same directory as this Rmd file
# see what it gives us
# CASchool # don't run this in knitted markdown, it will print 420 rows!!
 
# note since we have tidyverse loaded, it prints a nice tibble

# look only at first 10 rows
head(CASchool, n = 10)
observatdist_codcountydistrictgr_spanenrl_totteacherscalw_pctmeal_pctcomputertestscrcomp_stuexpn_stustravgincel_pctread_scrmath_scraowijefes_pctes_frac
17.51e+04AlamedaSunol Glen UnifiedKK-08195       10.90.512.04676910.344 6.38e+0317.922.7 0   69269035.81   0.01  
26.15e+04ButteManzanita ElementaryKK-08240       11.115.4 47.9 1016610.421 5.1e+03 21.59.824.5866066243  3.580.0358
36.15e+04ButteThermalito Union ElementaryKK-081.55e+0382.955   76.3 1696440.109 5.5e+03 18.78.9830   63665137.429   0.29  
46.15e+04ButteGolden Feather Union ElementaryKK-08243       14  36.5 77   856480.35  7.1e+03 17.48.980   65264434.71   0.01  
56.15e+04ButtePalermo Union ElementaryKK-081.34e+0371.533.1 78.4 1716410.128 5.24e+0318.79.0813.9 64264037.312.9 0.129 
66.2e+04 FresnoBurrel Union ElementaryKK-08137       6.412.3 87   256060.182 5.58e+0321.410.4 12.4 60660542.811.4 0.114 
76.85e+04San JoaquinHolt Union ElementaryKK-08195       10  12.9 94.6 286070.144 5.25e+0319.56.5868.7 60460939  67.7 0.677 
86.38e+04KernVineland ElementaryKK-08888       42.518.8 100   666090.07434.57e+0320.98.1747   60661241.846   0.46  
96.23e+04FresnoOrange Center ElementaryKK-08379       19  32.2 93.1 356120.09235.36e+0319.97.3930.1 60961639.929.1 0.291 
106.73e+04SacramentoDel Paso Heights ElementaryKK-062.25e+03108  79   87.3 06130     5.04e+0320.811.6 40.3 61261341.639.3 0.393 
# get structure
str(CASchool)
## tibble [420 × 21] (S3: tbl_df/tbl/data.frame)
##  $ observat: num [1:420] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ dist_cod: num [1:420] 75119 61499 61549 61457 61523 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ county  : chr [1:420] "Alameda" "Butte" "Butte" "Butte" ...
##   ..- attr(*, "format.stata")= chr "%18s"
##  $ district: chr [1:420] "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
##   ..- attr(*, "format.stata")= chr "%53s"
##  $ gr_span : chr [1:420] "KK-08" "KK-08" "KK-08" "KK-08" ...
##   ..- attr(*, "format.stata")= chr "%8s"
##  $ enrl_tot: num [1:420] 195 240 1550 243 1335 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ teachers: num [1:420] 10.9 11.1 82.9 14 71.5 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ calw_pct: num [1:420] 0.51 15.42 55.03 36.48 33.11 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ meal_pct: num [1:420] 2.04 47.92 76.32 77.05 78.43 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ computer: num [1:420] 67 101 169 85 171 25 28 66 35 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ testscr : num [1:420] 691 661 644 648 641 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ comp_stu: num [1:420] 0.344 0.421 0.109 0.35 0.128 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ expn_stu: num [1:420] 6385 5099 5502 7102 5236 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ str     : num [1:420] 17.9 21.5 18.7 17.4 18.7 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ avginc  : num [1:420] 22.69 9.82 8.98 8.98 9.08 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ el_pct  : num [1:420] 0 4.58 30 0 13.86 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ read_scr: num [1:420] 692 660 636 652 642 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ math_scr: num [1:420] 690 662 651 644 640 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ aowijef : num [1:420] 35.8 43 37.4 34.7 37.3 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ es_pct  : num [1:420] 1 3.58 29 1 12.86 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ es_frac : num [1:420] 0.01 0.0358 0.29 0.01 0.1286 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
# use tidyverse methods to look at structure
glimpse(CASchool)
## Rows: 420
## Columns: 21
## $ observat <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ dist_cod <dbl> 75119, 61499, 61549, 61457, 61523, 62042, 68536, 63834, 62331…
## $ county   <chr> "Alameda", "Butte", "Butte", "Butte", "Butte", "Fresno", "San…
## $ district <chr> "Sunol Glen Unified", "Manzanita Elementary", "Thermalito Uni…
## $ gr_span  <chr> "KK-08", "KK-08", "KK-08", "KK-08", "KK-08", "KK-08", "KK-08"…
## $ enrl_tot <dbl> 195, 240, 1550, 243, 1335, 137, 195, 888, 379, 2247, 446, 987…
## $ teachers <dbl> 10.90, 11.15, 82.90, 14.00, 71.50, 6.40, 10.00, 42.50, 19.00,…
## $ calw_pct <dbl> 0.5102, 15.4167, 55.0323, 36.4754, 33.1086, 12.3188, 12.9032,…
## $ meal_pct <dbl> 2.0408, 47.9167, 76.3226, 77.0492, 78.4270, 86.9565, 94.6237,…
## $ computer <dbl> 67, 101, 169, 85, 171, 25, 28, 66, 35, 0, 86, 56, 25, 0, 31, …
## $ testscr  <dbl> 690.80, 661.20, 643.60, 647.70, 640.85, 605.55, 606.75, 609.0…
## $ comp_stu <dbl> 0.34358975, 0.42083332, 0.10903226, 0.34979424, 0.12808989, 0…
## $ expn_stu <dbl> 6384.911, 5099.381, 5501.955, 7101.831, 5235.988, 5580.147, 5…
## $ str      <dbl> 17.88991, 21.52466, 18.69723, 17.35714, 18.67133, 21.40625, 1…
## $ avginc   <dbl> 22.690001, 9.824000, 8.978000, 8.978000, 9.080333, 10.415000,…
## $ el_pct   <dbl> 0.000000, 4.583333, 30.000002, 0.000000, 13.857677, 12.408759…
## $ read_scr <dbl> 691.6, 660.5, 636.3, 651.9, 641.8, 605.7, 604.5, 605.5, 608.9…
## $ math_scr <dbl> 690.0, 661.9, 650.9, 643.5, 639.9, 605.4, 609.0, 612.5, 616.1…
## $ aowijef  <dbl> 35.77982, 43.04933, 37.39445, 34.71429, 37.34266, 42.81250, 3…
## $ es_pct   <dbl> 1.000000, 3.583333, 29.000002, 1.000000, 12.857677, 11.408759…
## $ es_frac  <dbl> 0.01000000, 0.03583334, 0.29000002, 0.01000000, 0.12857677, 0…

Scatterplot

# save as scatter
scatter <- ggplot(data = CASchool)+
  aes(x = str,
      y = testscr)+
  geom_point(color = "blue")+
  labs(x = "Student to Teacher Ratio",
       y = "Test Score")+
  theme_pander(base_family = "Fira Sans Condensed",
           base_size = 20)

# look at it 
scatter

Simple Linear Regression (1 Variable)

# run simple regression, save as "reg"
reg <- lm(testscr ~ str, data = CASchool)

# print it (just gives us the coefficients)
reg
## 
## Call:
## lm(formula = testscr ~ str, data = CASchool)
## 
## Coefficients:
## (Intercept)          str  
##      698.93        -2.28
# get full summary
summary(reg)
## 
## Call:
## lm(formula = testscr ~ str, data = CASchool)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.727 -14.251   0.483  12.822  48.540 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 698.9330     9.4675  73.825  < 2e-16 ***
## str          -2.2798     0.4798  -4.751 2.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.58 on 418 degrees of freedom
## Multiple R-squared:  0.05124,    Adjusted R-squared:  0.04897 
## F-statistic: 22.58 on 1 and 418 DF,  p-value: 2.783e-06
# Note: this also works (tidyverse): 
# reg %>% summary()
# use broom for tidier output

# tidy() gives us coefficients
reg %>% tidy()
termestimatestd.errorstatisticp.value
(Intercept)699   9.4773.8 6.57e-242
str-2.280.48-4.752.78e-06 
# glance() gives us regression fit statistics
reg %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.05120.04918.622.62.78e-061-1.82e+033.65e+033.66e+031.44e+05418420
# augment() creates observations for fitted and residual values (among other things)
reg %>% augment() %>% head() # I added head() because otherwise it will print 420 rows!!
testscrstr.fitted.resid.hat.sigma.cooksd.std.resid
69117.965832.70.0044218.50.00689 1.76 
66121.565011.30.0047518.60.0008930.612
64418.7656-12.70.0029718.60.0007  -0.685
64817.4659-11.70.0058618.60.00117 -0.629
64118.7656-15.50.0030118.60.00105 -0.836
60621.4650-44.60.0044618.50.013   -2.4  

You can type out the regression equation in markdown using LaTeX math code:

\[\widehat{\text{test score}}=689.9-2.28 \times str\]

Or you can try out equatiomatic:

# be sure to set chunk option to: results="asis"

extract_eq(reg, # regression lm object
           use_coefs = TRUE, # use names of variables
           coef_digits = 2, # round to 2 digits
           fix_signs = TRUE) # fix negatives (instead of + -)

\[ \operatorname{\widehat{testscr}} = 698.93 - 2.28(\operatorname{str}) \]

Now let’s go back to the scatterplot and add our regression line!

scatter+
  geom_smooth(method = "lm", # at minimum, must set method to linear model
              color = "red", # change color
              se = TRUE) # set to FALSE if you want to hide the SE range (gray)
## `geom_smooth()` using formula 'y ~ x'

Now let’s print a nice regression output table

huxreg(reg)
(1)
(Intercept)698.933 ***
(9.467)   
str-2.280 ***
(0.480)   
N420        
R20.051    
logLik-1822.250    
AIC3650.499    
*** p < 0.001; ** p < 0.01; * p < 0.05.

We can customize it a bit

huxreg("Test Score" = reg, # set Y name
       coefs = c("Intercept" = "(Intercept)", # rename X variable & intercept
                 "STR" = "str"),
       statistics = c("N" = "nobs", # show only these stats and name them
                      "R-Squared" = "r.squared",
                      "SER" = "sigma"),
       number_format = 2) # round to 2 decimals
Test Score
Intercept698.93 ***
(9.47)   
STR-2.28 ***
(0.48)   
N420       
R-Squared0.05    
SER18.58    
*** p < 0.001; ** p < 0.01; * p < 0.05.

Regression Analysis

# first let's use augment again, and *save* as aug_reg
aug_reg <- augment(reg)

# now we can plot a histogram using aug_reg

ggplot(data = aug_reg)+
  aes(x = .resid)+ # the residuals!
  geom_histogram(color = "white",
                 fill = "hotpink")+
  labs(x = expression(paste("Residual, ", hat(u))))+
  theme_pander(base_family = "Fira Sans Condensed",
           base_size=20)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = aug_reg)+
  aes(x = str,
      y = .resid)+
  geom_point(color="blue")+
  geom_hline(aes(yintercept = 0), color="red")+
  labs(x = "Student to Teacher Ratio",
       y = expression(paste("Residual, ", hat(u))))+
  theme_pander(base_family = "Fira Sans Condensed",
           base_size=20)

Testing for heteroskedasticity

# using package lmtest
bptest(reg)
## 
##  studentized Breusch-Pagan test
## 
## data:  reg
## BP = 5.7936, df = 1, p-value = 0.01608
# using package estimatr
reg_robust <-lm_robust(testscr ~ str,
                              data = CASchool,
                              se_type = "stata")

reg_robust %>% summary()
## 
## Call:
## lm_robust(formula = testscr ~ str, data = CASchool, se_type = "stata")
## 
## Standard error type:  HC1 
## 
## Coefficients:
##             Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper  DF
## (Intercept)   698.93    10.3644  67.436 9.487e-227  678.560  719.306 418
## str            -2.28     0.5195  -4.389  1.447e-05   -3.301   -1.259 418
## 
## Multiple R-squared:  0.05124 ,   Adjusted R-squared:  0.04897 
## F-statistic: 19.26 on 1 and 418 DF,  p-value: 1.447e-05
huxreg("Normal" = reg,
       "Robust" = reg_robust,
       coefs = c("Intercept" = "(Intercept)",
                 "STR" = "str"),
       statistics = c("N" = "nobs",
                      "R-Squared" = "r.squared",
                      "SER" = "sigma"),
       number_format = 2)
NormalRobust
Intercept698.93 ***698.93 ***
(9.47)   (10.36)   
STR-2.28 ***-2.28 ***
(0.48)   (0.52)   
N420       420       
R-Squared0.05    0.05    
SER18.58           
*** p < 0.001; ** p < 0.01; * p < 0.05.

Check for outliers

# using car package
outlierTest(reg) # will point out which obs #s seem outliers
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 417 2.636833           0.008681           NA

Multivariate Regressions

Correlation Table

CASchool %>%
  select("str","testscr","el_pct", "expn_stu") %>%
  cor()
##                 str    testscr      el_pct    expn_stu
## str       1.0000000 -0.2263628  0.18764237 -0.61998215
## testscr  -0.2263628  1.0000000 -0.64412374  0.19127277
## el_pct    0.1876424 -0.6441237  1.00000000 -0.07139604
## expn_stu -0.6199821  0.1912728 -0.07139604  1.00000000

Conditional Distribution (of Test Score) by Low/High ESL

# make a new variable called EL
# = high (if el_pct is above median) or = low (if below median)
CASchool<-CASchool %>% # next we create a new dummy variable called ESL
  mutate(ESL = ifelse(el_pct > median(el_pct), # test if ESL is above median
                     yes = "High ESL", # if yes, call this variable "High ESL"
                     no = "Low ESL")) # if no, call this variable "Low ESL"

# get average test score by high/low EL
CASchool %>%
  group_by(ESL) %>%
  summarize(Average_test_score=mean(testscr))
ESLAverage_test_score
High ESL644
Low ESL664
# make distributions
ggplot(data = CASchool)+
  aes(x = testscr,
      fill = ESL)+
  geom_density(alpha=0.5)+
  labs(x = "Test Score",
       y = "Density")+
  ggthemes::theme_pander(
    base_family = "Fira Sans Condensed",
    base_size=20
    )+
  theme(legend.position = "bottom")

# make scatterplot

esl_scatter<-ggplot(data = CASchool)+
  aes(x = str,
      y = testscr,
      color = ESL)+
  geom_point()+
  geom_smooth(method="lm")+
  labs(x = "STR",
       y = "Test Score")+
  ggthemes::theme_pander(
    base_family = "Fira Sans Condensed",
    base_size=20
    )+
  theme(legend.position = "bottom")
esl_scatter
## `geom_smooth()` using formula 'y ~ x'

# facet instead

esl_scatter+
  facet_grid(~ESL)+ #<<
  guides(color = F) #<<
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## `geom_smooth()` using formula 'y ~ x'

Multivariate Regression (with %EL)

# run regression of testscr on str and el_pct
school_reg_2 <- lm(testscr ~ str + el_pct, 
                 data = CASchool)

# look at it (just gives coefficients)
school_reg_2
## 
## Call:
## lm(formula = testscr ~ str + el_pct, data = CASchool)
## 
## Coefficients:
## (Intercept)          str       el_pct  
##    686.0322      -1.1013      -0.6498
# get summary
school_reg_2 %>% summary()
## 
## Call:
## lm(formula = testscr ~ str + el_pct, data = CASchool)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.845 -10.240  -0.308   9.815  43.461 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 686.03225    7.41131  92.566  < 2e-16 ***
## str          -1.10130    0.38028  -2.896  0.00398 ** 
## el_pct       -0.64978    0.03934 -16.516  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.46 on 417 degrees of freedom
## Multiple R-squared:  0.4264, Adjusted R-squared:  0.4237 
## F-statistic:   155 on 2 and 417 DF,  p-value: < 2.2e-16
# do it tider with broom
school_reg_2 %>% tidy()
termestimatestd.errorstatisticp.value
(Intercept)686   7.41  92.63.87e-280
str-1.1 0.38  -2.90.00398  
el_pct-0.650.0393-16.51.66e-47 
# get regression fit statistics
school_reg_2 %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.4260.42414.51554.62e-512-1.72e+033.44e+033.46e+038.72e+04417420
# make a regression output table
library(huxtable)
huxreg("Model 1" = reg,
       "Model 2" = school_reg_2,
       coefs = c("Intercept" = "(Intercept)",
                 "Class Size" = "str",
                 "%ESL Students" = "el_pct"),
       statistics = c("N" = "nobs",
                      "R-Squared" = "r.squared",
                      "SER" = "sigma"),
       number_format = 2)
Model 1Model 2
Intercept698.93 ***686.03 ***
(9.47)   (7.41)   
Class Size-2.28 ***-1.10 ** 
(0.48)   (0.38)   
%ESL Students       -0.65 ***
       (0.04)   
N420       420       
R-Squared0.05    0.43    
SER18.58    14.46    
*** p < 0.001; ** p < 0.01; * p < 0.05.

Measuring OVB

# "true" regression (str and %EL)
true<-lm(testscr~str+el_pct, data=CASchool)
true %>% tidy()
termestimatestd.errorstatisticp.value
(Intercept)686   7.41  92.63.87e-280
str-1.1 0.38  -2.90.00398  
el_pct-0.650.0393-16.51.66e-47 
# "omitted" regression (omitting %EL)
omitted<-lm(testscr~str, data=CASchool)
omitted %>% tidy()
termestimatestd.errorstatisticp.value
(Intercept)699   9.4773.8 6.57e-242
str-2.280.48-4.752.78e-06 
# "auxiliary" regression (%EL on str)
aux<-lm(el_pct~str, data=CASchool)
aux %>% tidy()
termestimatestd.errorstatisticp.value
(Intercept)-19.9 9.16 -2.170.0308 
str1.810.4643.910.00011

Variance and VIF

library(car) # needs car package
true %>% vif()
##      str   el_pct 
## 1.036495 1.036495
# calculate VIF manually

# look at aux reg stats for R^2
# extract our R-squared from aux regression (R_j^2)
aux_r_sq<-glance(aux) %>%
  select(r.squared)

aux_r_sq # look at it
r.squared
0.0352
# calculate VIF manually

our_vif<-1/(1-aux_r_sq) # VIF formula 

our_vif
r.squared
1.04