Loading In Data
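The code in this document assumes the required packages were already loaded in an earlier setup chunk; a minimal sketch of what that chunk might contain:
# assumed setup chunk (a sketch; not shown in the original)
library(tidyverse)    # dplyr, ggplot2, etc.
library(haven)        # read_dta() for Stata .dta files
library(broom)        # tidy(), glance(), augment()
library(ggthemes)     # theme_pander()
library(equatiomatic) # extract_eq()
library(huxtable)     # huxreg()
library(lmtest)       # bptest()
library(estimatr)     # lm_robust()
library(car)          # outlierTest(), vif()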
CASchool <- read_dta("caschool.dta") # this is in the same directory as this Rmd file
# see what it gives us
# CASchool # don't run this in knitted markdown, it will print 420 rows!!
# note since we have tidyverse loaded, it prints a nice tibble
# look only at first 10 rows
head(CASchool, n = 10)
observat | dist_cod | county | district | gr_span | enrl_tot | teachers | calw_pct | meal_pct | computer | testscr | comp_stu | expn_stu | str | avginc | el_pct | read_scr | math_scr | aowijef | es_pct | es_frac |
1 | 7.51e+04 | Alameda | Sunol Glen Unified | KK-08 | 195 | 10.9 | 0.51 | 2.04 | 67 | 691 | 0.344 | 6.38e+03 | 17.9 | 22.7 | 0 | 692 | 690 | 35.8 | 1 | 0.01 |
2 | 6.15e+04 | Butte | Manzanita Elementary | KK-08 | 240 | 11.1 | 15.4 | 47.9 | 101 | 661 | 0.421 | 5.1e+03 | 21.5 | 9.82 | 4.58 | 660 | 662 | 43 | 3.58 | 0.0358 |
3 | 6.15e+04 | Butte | Thermalito Union Elementary | KK-08 | 1.55e+03 | 82.9 | 55 | 76.3 | 169 | 644 | 0.109 | 5.5e+03 | 18.7 | 8.98 | 30 | 636 | 651 | 37.4 | 29 | 0.29 |
4 | 6.15e+04 | Butte | Golden Feather Union Elementary | KK-08 | 243 | 14 | 36.5 | 77 | 85 | 648 | 0.35 | 7.1e+03 | 17.4 | 8.98 | 0 | 652 | 644 | 34.7 | 1 | 0.01 |
5 | 6.15e+04 | Butte | Palermo Union Elementary | KK-08 | 1.34e+03 | 71.5 | 33.1 | 78.4 | 171 | 641 | 0.128 | 5.24e+03 | 18.7 | 9.08 | 13.9 | 642 | 640 | 37.3 | 12.9 | 0.129 |
6 | 6.2e+04 | Fresno | Burrel Union Elementary | KK-08 | 137 | 6.4 | 12.3 | 87 | 25 | 606 | 0.182 | 5.58e+03 | 21.4 | 10.4 | 12.4 | 606 | 605 | 42.8 | 11.4 | 0.114 |
7 | 6.85e+04 | San Joaquin | Holt Union Elementary | KK-08 | 195 | 10 | 12.9 | 94.6 | 28 | 607 | 0.144 | 5.25e+03 | 19.5 | 6.58 | 68.7 | 604 | 609 | 39 | 67.7 | 0.677 |
8 | 6.38e+04 | Kern | Vineland Elementary | KK-08 | 888 | 42.5 | 18.8 | 100 | 66 | 609 | 0.0743 | 4.57e+03 | 20.9 | 8.17 | 47 | 606 | 612 | 41.8 | 46 | 0.46 |
9 | 6.23e+04 | Fresno | Orange Center Elementary | KK-08 | 379 | 19 | 32.2 | 93.1 | 35 | 612 | 0.0923 | 5.36e+03 | 19.9 | 7.39 | 30.1 | 609 | 616 | 39.9 | 29.1 | 0.291 |
10 | 6.73e+04 | Sacramento | Del Paso Heights Elementary | KK-06 | 2.25e+03 | 108 | 79 | 87.3 | 0 | 613 | 0 | 5.04e+03 | 20.8 | 11.6 | 40.3 | 612 | 613 | 41.6 | 39.3 | 0.393 |
# get structure
str(CASchool)
## tibble [420 × 21] (S3: tbl_df/tbl/data.frame)
## $ observat: num [1:420] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ dist_cod: num [1:420] 75119 61499 61549 61457 61523 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ county : chr [1:420] "Alameda" "Butte" "Butte" "Butte" ...
## ..- attr(*, "format.stata")= chr "%18s"
## $ district: chr [1:420] "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
## ..- attr(*, "format.stata")= chr "%53s"
## $ gr_span : chr [1:420] "KK-08" "KK-08" "KK-08" "KK-08" ...
## ..- attr(*, "format.stata")= chr "%8s"
## $ enrl_tot: num [1:420] 195 240 1550 243 1335 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ teachers: num [1:420] 10.9 11.1 82.9 14 71.5 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ calw_pct: num [1:420] 0.51 15.42 55.03 36.48 33.11 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ meal_pct: num [1:420] 2.04 47.92 76.32 77.05 78.43 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ computer: num [1:420] 67 101 169 85 171 25 28 66 35 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ testscr : num [1:420] 691 661 644 648 641 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ comp_stu: num [1:420] 0.344 0.421 0.109 0.35 0.128 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ expn_stu: num [1:420] 6385 5099 5502 7102 5236 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ str : num [1:420] 17.9 21.5 18.7 17.4 18.7 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ avginc : num [1:420] 22.69 9.82 8.98 8.98 9.08 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ el_pct : num [1:420] 0 4.58 30 0 13.86 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ read_scr: num [1:420] 692 660 636 652 642 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ math_scr: num [1:420] 690 662 651 644 640 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ aowijef : num [1:420] 35.8 43 37.4 34.7 37.3 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ es_pct : num [1:420] 1 3.58 29 1 12.86 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ es_frac : num [1:420] 0.01 0.0358 0.29 0.01 0.1286 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
# use tidyverse methods to look at structure
glimpse(CASchool)
## Rows: 420
## Columns: 21
## $ observat <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ dist_cod <dbl> 75119, 61499, 61549, 61457, 61523, 62042, 68536, 63834, 62331…
## $ county <chr> "Alameda", "Butte", "Butte", "Butte", "Butte", "Fresno", "San…
## $ district <chr> "Sunol Glen Unified", "Manzanita Elementary", "Thermalito Uni…
## $ gr_span <chr> "KK-08", "KK-08", "KK-08", "KK-08", "KK-08", "KK-08", "KK-08"…
## $ enrl_tot <dbl> 195, 240, 1550, 243, 1335, 137, 195, 888, 379, 2247, 446, 987…
## $ teachers <dbl> 10.90, 11.15, 82.90, 14.00, 71.50, 6.40, 10.00, 42.50, 19.00,…
## $ calw_pct <dbl> 0.5102, 15.4167, 55.0323, 36.4754, 33.1086, 12.3188, 12.9032,…
## $ meal_pct <dbl> 2.0408, 47.9167, 76.3226, 77.0492, 78.4270, 86.9565, 94.6237,…
## $ computer <dbl> 67, 101, 169, 85, 171, 25, 28, 66, 35, 0, 86, 56, 25, 0, 31, …
## $ testscr <dbl> 690.80, 661.20, 643.60, 647.70, 640.85, 605.55, 606.75, 609.0…
## $ comp_stu <dbl> 0.34358975, 0.42083332, 0.10903226, 0.34979424, 0.12808989, 0…
## $ expn_stu <dbl> 6384.911, 5099.381, 5501.955, 7101.831, 5235.988, 5580.147, 5…
## $ str <dbl> 17.88991, 21.52466, 18.69723, 17.35714, 18.67133, 21.40625, 1…
## $ avginc <dbl> 22.690001, 9.824000, 8.978000, 8.978000, 9.080333, 10.415000,…
## $ el_pct <dbl> 0.000000, 4.583333, 30.000002, 0.000000, 13.857677, 12.408759…
## $ read_scr <dbl> 691.6, 660.5, 636.3, 651.9, 641.8, 605.7, 604.5, 605.5, 608.9…
## $ math_scr <dbl> 690.0, 661.9, 650.9, 643.5, 639.9, 605.4, 609.0, 612.5, 616.1…
## $ aowijef <dbl> 35.77982, 43.04933, 37.39445, 34.71429, 37.34266, 42.81250, 3…
## $ es_pct <dbl> 1.000000, 3.583333, 29.000002, 1.000000, 12.857677, 11.408759…
## $ es_frac <dbl> 0.01000000, 0.03583334, 0.29000002, 0.01000000, 0.12857677, 0…
Scatterplot
# save as scatter
scatter <- ggplot(data = CASchool)+
aes(x = str,
y = testscr)+
geom_point(color = "blue")+
labs(x = "Student to Teacher Ratio",
y = "Test Score")+
theme_pander(base_family = "Fira Sans Condensed",
base_size = 20)
# look at it
scatter
Simple Linear Regression (1 Variable)
# run simple regression, save as "reg"
reg <- lm(testscr ~ str, data = CASchool)
# print it (just gives us the coefficients)
reg
##
## Call:
## lm(formula = testscr ~ str, data = CASchool)
##
## Coefficients:
## (Intercept) str
## 698.93 -2.28
# get full summary
summary(reg)
##
## Call:
## lm(formula = testscr ~ str, data = CASchool)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.727 -14.251 0.483 12.822 48.540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 698.9330 9.4675 73.825 < 2e-16 ***
## str -2.2798 0.4798 -4.751 2.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.58 on 418 degrees of freedom
## Multiple R-squared: 0.05124, Adjusted R-squared: 0.04897
## F-statistic: 22.58 on 1 and 418 DF, p-value: 2.783e-06
# Note: this also works (tidyverse):
# reg %>% summary()
# use broom for tidier output
# tidy() gives us coefficients
reg %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 699 | 9.47 | 73.8 | 6.57e-242 |
str | -2.28 | 0.48 | -4.75 | 2.78e-06 |
# glance() gives us regression fit statistics
reg %>% glance()
r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
0.0512 | 0.049 | 18.6 | 22.6 | 2.78e-06 | 1 | -1.82e+03 | 3.65e+03 | 3.66e+03 | 1.44e+05 | 418 | 420 |
# augment() adds columns of fitted values and residuals (among other things) for each observation
reg %>% augment() %>% head() # I added head() because otherwise it will print 420 rows!!
testscr | str | .fitted | .resid | .hat | .sigma | .cooksd | .std.resid |
691 | 17.9 | 658 | 32.7 | 0.00442 | 18.5 | 0.00689 | 1.76 |
661 | 21.5 | 650 | 11.3 | 0.00475 | 18.6 | 0.000893 | 0.612 |
644 | 18.7 | 656 | -12.7 | 0.00297 | 18.6 | 0.0007 | -0.685 |
648 | 17.4 | 659 | -11.7 | 0.00586 | 18.6 | 0.00117 | -0.629 |
641 | 18.7 | 656 | -15.5 | 0.00301 | 18.6 | 0.00105 | -0.836 |
606 | 21.4 | 650 | -44.6 | 0.00446 | 18.5 | 0.013 | -2.4 |
You can type out the regression equation in markdown using LaTeX math code:
\[\widehat{\text{test score}}=698.9-2.28 \times str\]
Or you can try out equatiomatic:
# be sure to set chunk option to: results="asis"
extract_eq(reg, # regression lm object
use_coefs = TRUE, # show estimated coefficients instead of symbols
coef_digits = 2, # round coefficients to 2 digits
fix_signs = TRUE) # fix signs of negative coefficients (instead of printing "+ -")
\[
\operatorname{\widehat{testscr}} = 698.93 - 2.28(\operatorname{str})
\]
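By default (use_coefs = FALSE), extract_eq() should write the equation with symbols (alpha, beta) rather than the estimated values:
# default call: prints the equation with symbolic coefficients instead of estimates
extract_eq(reg)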
Now let’s go back to the scatterplot and add our regression line!
scatter+
geom_smooth(method = "lm", # at minimum, must set method to linear model
color = "red", # change color
se = TRUE) # set to FALSE if you want to hide the SE range (gray)
## `geom_smooth()` using formula 'y ~ x'
Now let’s print a nice regression output table
huxreg(reg)
| (1) |
(Intercept) | 698.933 *** |
| (9.467) |
str | -2.280 *** |
| (0.480) |
N | 420 |
R2 | 0.051 |
logLik | -1822.250 |
AIC | 3650.499 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
We can customize it a bit
huxreg("Test Score" = reg, # set Y name
coefs = c("Intercept" = "(Intercept)", # rename X variable & intercept
"STR" = "str"),
statistics = c("N" = "nobs", # show only these stats and name them
"R-Squared" = "r.squared",
"SER" = "sigma"),
number_format = 2) # round to 2 decimals
| Test Score |
Intercept | 698.93 *** |
| (9.47) |
STR | -2.28 *** |
| (0.48) |
N | 420 |
R-Squared | 0.05 |
SER | 18.58 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Regression Analysis
# first let's use augment again, and *save* as aug_reg
aug_reg <- augment(reg)
# now we can plot a histogram using aug_reg
ggplot(data = aug_reg)+
aes(x = .resid)+ # the residuals!
geom_histogram(color = "white",
fill = "hotpink")+
labs(x = expression(paste("Residual, ", hat(u))))+
theme_pander(base_family = "Fira Sans Condensed",
base_size=20)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = aug_reg)+
aes(x = str,
y = .resid)+
geom_point(color="blue")+
geom_hline(aes(yintercept = 0), color="red")+
labs(x = "Student to Teacher Ratio",
y = expression(paste("Residual, ", hat(u))))+
theme_pander(base_family = "Fira Sans Condensed",
base_size=20)
Testing for heteroskedasticity
# using package lmtest
bptest(reg)
##
## studentized Breusch-Pagan test
##
## data: reg
## BP = 5.7936, df = 1, p-value = 0.01608
The small p-value (about 0.016) rejects the null of homoskedasticity at the 5% level, so robust standard errors are warranted.
# using package estimatr
reg_robust <- lm_robust(testscr ~ str,
data = CASchool,
se_type = "stata")
reg_robust %>% summary()
##
## Call:
## lm_robust(formula = testscr ~ str, data = CASchool, se_type = "stata")
##
## Standard error type: HC1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 698.93 10.3644 67.436 9.487e-227 678.560 719.306 418
## str -2.28 0.5195 -4.389 1.447e-05 -3.301 -1.259 418
##
## Multiple R-squared: 0.05124 , Adjusted R-squared: 0.04897
## F-statistic: 19.26 on 1 and 418 DF, p-value: 1.447e-05
huxreg("Normal" = reg,
"Robust" = reg_robust,
coefs = c("Intercept" = "(Intercept)",
"STR" = "str"),
statistics = c("N" = "nobs",
"R-Squared" = "r.squared",
"SER" = "sigma"),
number_format = 2)
| Normal | Robust |
Intercept | 698.93 *** | 698.93 *** |
| (9.47) | (10.36) |
STR | -2.28 *** | -2.28 *** |
| (0.48) | (0.52) |
N | 420 | 420 |
R-Squared | 0.05 | 0.05 |
SER | 18.58 | |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Check for outliers
# using car package
outlierTest(reg) # will point out which obs #s seem outliers
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 417 2.636833 0.008681 NA
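No observation is flagged at the Bonferroni-adjusted 5% level. As an optional extra check (a sketch using the aug_reg object created earlier), we can sort by Cook's distance to see the most influential observations:
# optional: view the observations with the largest Cook's distance
aug_reg %>%
arrange(desc(.cooksd)) %>%
head(n = 5)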
Multivariate Regressions
Correlation Table
CASchool %>%
select("str","testscr","el_pct", "expn_stu") %>%
cor()
## str testscr el_pct expn_stu
## str 1.0000000 -0.2263628 0.18764237 -0.61998215
## testscr -0.2263628 1.0000000 -0.64412374 0.19127277
## el_pct 0.1876424 -0.6441237 1.00000000 -0.07139604
## expn_stu -0.6199821 0.1912728 -0.07139604 1.00000000
Note that str is positively correlated with el_pct (0.19) while el_pct is strongly negatively correlated with testscr (-0.64), a hint that omitting el_pct will bias the simple regression of testscr on str.
Conditional Distribution (of Test Score) by Low/High ESL
# make a new variable called ESL
# = "High ESL" (if el_pct is above the median) or "Low ESL" (if at or below the median)
CASchool <- CASchool %>% # create a new dummy variable called ESL
mutate(ESL = ifelse(el_pct > median(el_pct), # test if el_pct is above the median
yes = "High ESL", # if yes, label it "High ESL"
no = "Low ESL")) # if no, label it "Low ESL"
# get average test score by high/low ESL
CASchool %>%
group_by(ESL) %>%
summarize(Average_test_score = mean(testscr))
ESL | Average_test_score |
High ESL | 644 |
Low ESL | 664 |
# make distributions
ggplot(data = CASchool)+
aes(x = testscr,
fill = ESL)+
geom_density(alpha=0.5)+
labs(x = "Test Score",
y = "Density")+
ggthemes::theme_pander(
base_family = "Fira Sans Condensed",
base_size=20
)+
theme(legend.position = "bottom")
# make scatterplot
esl_scatter<-ggplot(data = CASchool)+
aes(x = str,
y = testscr,
color = ESL)+
geom_point()+
geom_smooth(method="lm")+
labs(x = "STR",
y = "Test Score")+
ggthemes::theme_pander(
base_family = "Fira Sans Condensed",
base_size=20
)+
theme(legend.position = "bottom")
esl_scatter
## `geom_smooth()` using formula 'y ~ x'
# facet instead
esl_scatter+
facet_grid(~ESL)+
guides(color = "none") # hide the redundant color legend
## `geom_smooth()` using formula 'y ~ x'
Multivariate Regression (with %EL)
# run regression of testscr on str and el_pct
school_reg_2 <- lm(testscr ~ str + el_pct,
data = CASchool)
# look at it (just gives coefficients)
school_reg_2
##
## Call:
## lm(formula = testscr ~ str + el_pct, data = CASchool)
##
## Coefficients:
## (Intercept) str el_pct
## 686.0322 -1.1013 -0.6498
# get summary
school_reg_2 %>% summary()
##
## Call:
## lm(formula = testscr ~ str + el_pct, data = CASchool)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.845 -10.240 -0.308 9.815 43.461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 686.03225 7.41131 92.566 < 2e-16 ***
## str -1.10130 0.38028 -2.896 0.00398 **
## el_pct -0.64978 0.03934 -16.516 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.46 on 417 degrees of freedom
## Multiple R-squared: 0.4264, Adjusted R-squared: 0.4237
## F-statistic: 155 on 2 and 417 DF, p-value: < 2.2e-16
# do it tidier with broom
school_reg_2 %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 686 | 7.41 | 92.6 | 3.87e-280 |
str | -1.1 | 0.38 | -2.9 | 0.00398 |
el_pct | -0.65 | 0.0393 | -16.5 | 1.66e-47 |
# get regression fit statistics
school_reg_2 %>% glance()
r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
0.426 | 0.424 | 14.5 | 155 | 4.62e-51 | 2 | -1.72e+03 | 3.44e+03 | 3.46e+03 | 8.72e+04 | 417 | 420 |
# make a regression output table
library(huxtable)
huxreg("Model 1" = reg,
"Model 2" = school_reg_2,
coefs = c("Intercept" = "(Intercept)",
"Class Size" = "str",
"%ESL Students" = "el_pct"),
statistics = c("N" = "nobs",
"R-Squared" = "r.squared",
"SER" = "sigma"),
number_format = 2)
| Model 1 | Model 2 |
Intercept | 698.93 *** | 686.03 *** |
| (9.47) | (7.41) |
Class Size | -2.28 *** | -1.10 ** |
| (0.48) | (0.38) |
%ESL Students | | -0.65 *** |
| | (0.04) |
N | 420 | 420 |
R-Squared | 0.05 | 0.43 |
SER | 18.58 | 14.46 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Measuring OVB
# "true" regression (str and %EL)
true <- lm(testscr ~ str + el_pct, data = CASchool)
true %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 686 | 7.41 | 92.6 | 3.87e-280 |
str | -1.1 | 0.38 | -2.9 | 0.00398 |
el_pct | -0.65 | 0.0393 | -16.5 | 1.66e-47 |
# "omitted" regression (omitting %EL)
omitted <- lm(testscr ~ str, data = CASchool)
omitted %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 699 | 9.47 | 73.8 | 6.57e-242 |
str | -2.28 | 0.48 | -4.75 | 2.78e-06 |
# "auxiliary" regression (%EL on str)
aux <- lm(el_pct ~ str, data = CASchool)
aux %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | -19.9 | 9.16 | -2.17 | 0.0308 |
str | 1.81 | 0.464 | 3.91 | 0.00011 |
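We can now check the omitted variable bias (OVB) formula with these estimates: the bias in the omitted regression's str coefficient should equal the effect of el_pct in the true regression times the slope from the auxiliary regression. A quick check using the objects created above:
# OVB = (effect of omitted variable in true regression) x (aux regression slope)
beta_el <- coef(true)["el_pct"] # about -0.65
delta_str <- coef(aux)["str"] # about 1.81
beta_el * delta_str # about -1.18
# compare with the difference between the biased and true str coefficients
coef(omitted)["str"] - coef(true)["str"] # also about -1.18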
Variance and VIF
library(car) # needs car package
true %>% vif()
## str el_pct
## 1.036495 1.036495
# calculate VIF manually
# extract the R-squared from the auxiliary regression (R_j^2)
aux_r_sq <- glance(aux) %>%
select(r.squared)
aux_r_sq # look at it
# apply the VIF formula: 1 / (1 - R_j^2)
our_vif <- 1/(1 - aux_r_sq)
our_vif # should match the vif() output above (about 1.04)