Loading In Data
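The code in this document assumes the required packages were already loaded in an earlier setup chunk; a minimal sketch of what that chunk might contain:
# assumed setup chunk (a sketch; not shown in the original)
library(tidyverse)    # dplyr, ggplot2, etc.
library(haven)        # read_dta() for Stata .dta files
library(broom)        # tidy(), glance(), augment()
library(ggthemes)     # theme_pander()
library(equatiomatic) # extract_eq()
library(huxtable)     # huxreg()
library(lmtest)       # bptest()
library(estimatr)     # lm_robust()
library(car)          # outlierTest(), vif()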
CASchool <- read_dta("caschool.dta") # this is in the same directory as this Rmd file
# see what it gives us
# CASchool # don't run this in knitted markdown, it will print 420 rows!!
# note since we have tidyverse loaded, it prints a nice tibble
# look only at first 10 rows
head(CASchool, n = 10)
observat | dist_cod | county | district | gr_span | enrl_tot | teachers | calw_pct | meal_pct | computer | testscr | comp_stu | expn_stu | str | avginc | el_pct | read_scr | math_scr | aowijef | es_pct | es_frac |
1 | 7.51e+04 | Alameda | Sunol Glen Unified | KK-08 | 195 | 10.9 | 0.51 | 2.04 | 67 | 691 | 0.344 | 6.38e+03 | 17.9 | 22.7 | 0 | 692 | 690 | 35.8 | 1 | 0.01 |
2 | 6.15e+04 | Butte | Manzanita Elementary | KK-08 | 240 | 11.1 | 15.4 | 47.9 | 101 | 661 | 0.421 | 5.1e+03 | 21.5 | 9.82 | 4.58 | 660 | 662 | 43 | 3.58 | 0.0358 |
3 | 6.15e+04 | Butte | Thermalito Union Elementary | KK-08 | 1.55e+03 | 82.9 | 55 | 76.3 | 169 | 644 | 0.109 | 5.5e+03 | 18.7 | 8.98 | 30 | 636 | 651 | 37.4 | 29 | 0.29 |
4 | 6.15e+04 | Butte | Golden Feather Union Elementary | KK-08 | 243 | 14 | 36.5 | 77 | 85 | 648 | 0.35 | 7.1e+03 | 17.4 | 8.98 | 0 | 652 | 644 | 34.7 | 1 | 0.01 |
5 | 6.15e+04 | Butte | Palermo Union Elementary | KK-08 | 1.34e+03 | 71.5 | 33.1 | 78.4 | 171 | 641 | 0.128 | 5.24e+03 | 18.7 | 9.08 | 13.9 | 642 | 640 | 37.3 | 12.9 | 0.129 |
6 | 6.2e+04 | Fresno | Burrel Union Elementary | KK-08 | 137 | 6.4 | 12.3 | 87 | 25 | 606 | 0.182 | 5.58e+03 | 21.4 | 10.4 | 12.4 | 606 | 605 | 42.8 | 11.4 | 0.114 |
7 | 6.85e+04 | San Joaquin | Holt Union Elementary | KK-08 | 195 | 10 | 12.9 | 94.6 | 28 | 607 | 0.144 | 5.25e+03 | 19.5 | 6.58 | 68.7 | 604 | 609 | 39 | 67.7 | 0.677 |
8 | 6.38e+04 | Kern | Vineland Elementary | KK-08 | 888 | 42.5 | 18.8 | 100 | 66 | 609 | 0.0743 | 4.57e+03 | 20.9 | 8.17 | 47 | 606 | 612 | 41.8 | 46 | 0.46 |
9 | 6.23e+04 | Fresno | Orange Center Elementary | KK-08 | 379 | 19 | 32.2 | 93.1 | 35 | 612 | 0.0923 | 5.36e+03 | 19.9 | 7.39 | 30.1 | 609 | 616 | 39.9 | 29.1 | 0.291 |
10 | 6.73e+04 | Sacramento | Del Paso Heights Elementary | KK-06 | 2.25e+03 | 108 | 79 | 87.3 | 0 | 613 | 0 | 5.04e+03 | 20.8 | 11.6 | 40.3 | 612 | 613 | 41.6 | 39.3 | 0.393 |
# get structure
str(CASchool)
## tibble [420 × 21] (S3: tbl_df/tbl/data.frame)
## $ observat: num [1:420] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ dist_cod: num [1:420] 75119 61499 61549 61457 61523 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ county : chr [1:420] "Alameda" "Butte" "Butte" "Butte" ...
## ..- attr(*, "format.stata")= chr "%18s"
## $ district: chr [1:420] "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
## ..- attr(*, "format.stata")= chr "%53s"
## $ gr_span : chr [1:420] "KK-08" "KK-08" "KK-08" "KK-08" ...
## ..- attr(*, "format.stata")= chr "%8s"
## $ enrl_tot: num [1:420] 195 240 1550 243 1335 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ teachers: num [1:420] 10.9 11.1 82.9 14 71.5 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ calw_pct: num [1:420] 0.51 15.42 55.03 36.48 33.11 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ meal_pct: num [1:420] 2.04 47.92 76.32 77.05 78.43 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ computer: num [1:420] 67 101 169 85 171 25 28 66 35 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ testscr : num [1:420] 691 661 644 648 641 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ comp_stu: num [1:420] 0.344 0.421 0.109 0.35 0.128 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ expn_stu: num [1:420] 6385 5099 5502 7102 5236 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ str : num [1:420] 17.9 21.5 18.7 17.4 18.7 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ avginc : num [1:420] 22.69 9.82 8.98 8.98 9.08 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ el_pct : num [1:420] 0 4.58 30 0 13.86 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ read_scr: num [1:420] 692 660 636 652 642 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ math_scr: num [1:420] 690 662 651 644 640 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ aowijef : num [1:420] 35.8 43 37.4 34.7 37.3 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ es_pct : num [1:420] 1 3.58 29 1 12.86 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ es_frac : num [1:420] 0.01 0.0358 0.29 0.01 0.1286 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
# use tidyverse methods to look at structure
glimpse(CASchool)
## Rows: 420
## Columns: 21
## $ observat <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ dist_cod <dbl> 75119, 61499, 61549, 61457, 61523, 62042, 68536, 63834, 62331…
## $ county <chr> "Alameda", "Butte", "Butte", "Butte", "Butte", "Fresno", "San…
## $ district <chr> "Sunol Glen Unified", "Manzanita Elementary", "Thermalito Uni…
## $ gr_span <chr> "KK-08", "KK-08", "KK-08", "KK-08", "KK-08", "KK-08", "KK-08"…
## $ enrl_tot <dbl> 195, 240, 1550, 243, 1335, 137, 195, 888, 379, 2247, 446, 987…
## $ teachers <dbl> 10.90, 11.15, 82.90, 14.00, 71.50, 6.40, 10.00, 42.50, 19.00,…
## $ calw_pct <dbl> 0.5102, 15.4167, 55.0323, 36.4754, 33.1086, 12.3188, 12.9032,…
## $ meal_pct <dbl> 2.0408, 47.9167, 76.3226, 77.0492, 78.4270, 86.9565, 94.6237,…
## $ computer <dbl> 67, 101, 169, 85, 171, 25, 28, 66, 35, 0, 86, 56, 25, 0, 31, …
## $ testscr <dbl> 690.80, 661.20, 643.60, 647.70, 640.85, 605.55, 606.75, 609.0…
## $ comp_stu <dbl> 0.34358975, 0.42083332, 0.10903226, 0.34979424, 0.12808989, 0…
## $ expn_stu <dbl> 6384.911, 5099.381, 5501.955, 7101.831, 5235.988, 5580.147, 5…
## $ str <dbl> 17.88991, 21.52466, 18.69723, 17.35714, 18.67133, 21.40625, 1…
## $ avginc <dbl> 22.690001, 9.824000, 8.978000, 8.978000, 9.080333, 10.415000,…
## $ el_pct <dbl> 0.000000, 4.583333, 30.000002, 0.000000, 13.857677, 12.408759…
## $ read_scr <dbl> 691.6, 660.5, 636.3, 651.9, 641.8, 605.7, 604.5, 605.5, 608.9…
## $ math_scr <dbl> 690.0, 661.9, 650.9, 643.5, 639.9, 605.4, 609.0, 612.5, 616.1…
## $ aowijef <dbl> 35.77982, 43.04933, 37.39445, 34.71429, 37.34266, 42.81250, 3…
## $ es_pct <dbl> 1.000000, 3.583333, 29.000002, 1.000000, 12.857677, 11.408759…
## $ es_frac <dbl> 0.01000000, 0.03583334, 0.29000002, 0.01000000, 0.12857677, 0…
Scatterplot
# save as scatter
scatter <- ggplot(data = CASchool)+
aes(x = str,
y = testscr)+
geom_point(color = "blue")+
labs(x = "Student to Teacher Ratio",
y = "Test Score")+
theme_pander(base_family = "Fira Sans Condensed",
base_size = 20)
# look at it
scatter
Simple Linear Regression (1 Variable)
# run simple regression, save as "reg"
reg <- lm(testscr ~ str, data = CASchool)
# print it (just gives us the coefficients)
reg
##
## Call:
## lm(formula = testscr ~ str, data = CASchool)
##
## Coefficients:
## (Intercept) str
## 698.93 -2.28
# get full summary
summary(reg)
##
## Call:
## lm(formula = testscr ~ str, data = CASchool)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.727 -14.251 0.483 12.822 48.540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 698.9330 9.4675 73.825 < 2e-16 ***
## str -2.2798 0.4798 -4.751 2.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.58 on 418 degrees of freedom
## Multiple R-squared: 0.05124, Adjusted R-squared: 0.04897
## F-statistic: 22.58 on 1 and 418 DF, p-value: 2.783e-06
# Note: this also works (tidyverse):
# reg %>% summary()
# use broom for tidier output
# tidy() gives us coefficients
reg %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 699 | 9.47 | 73.8 | 6.57e-242 |
str | -2.28 | 0.48 | -4.75 | 2.78e-06 |
# glance() gives us regression fit statistics
reg %>% glance()
r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
0.0512 | 0.049 | 18.6 | 22.6 | 2.78e-06 | 1 | -1.82e+03 | 3.65e+03 | 3.66e+03 | 1.44e+05 | 418 | 420 |
# augment() adds columns of fitted values and residuals (among other things) for each observation
reg %>% augment() %>% head() # I added head() because otherwise it will print 420 rows!!
testscr | str | .fitted | .resid | .hat | .sigma | .cooksd | .std.resid |
691 | 17.9 | 658 | 32.7 | 0.00442 | 18.5 | 0.00689 | 1.76 |
661 | 21.5 | 650 | 11.3 | 0.00475 | 18.6 | 0.000893 | 0.612 |
644 | 18.7 | 656 | -12.7 | 0.00297 | 18.6 | 0.0007 | -0.685 |
648 | 17.4 | 659 | -11.7 | 0.00586 | 18.6 | 0.00117 | -0.629 |
641 | 18.7 | 656 | -15.5 | 0.00301 | 18.6 | 0.00105 | -0.836 |
606 | 21.4 | 650 | -44.6 | 0.00446 | 18.5 | 0.013 | -2.4 |
You can type out the regression equation in markdown using LaTeX math code:
\[\widehat{\text{test score}}=698.9-2.28 \times str\]
Or you can try out equatiomatic:
# be sure to set chunk option to: results="asis"
extract_eq(reg, # regression lm object
use_coefs = TRUE, # show estimated coefficients instead of symbols
coef_digits = 2, # round coefficients to 2 digits
fix_signs = TRUE) # fix signs of negative coefficients (instead of printing "+ -")
\[
\operatorname{\widehat{testscr}} = 698.93 - 2.28(\operatorname{str})
\]
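By default (use_coefs = FALSE), extract_eq() should write the equation with symbols (alpha, beta) rather than the estimated values:
# default call: prints the equation with symbolic coefficients instead of estimates
extract_eq(reg)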
Now let’s go back to the scatterplot and add our regression line!
scatter+
geom_smooth(method = "lm", # at minimum, must set method to linear model
color = "red", # change color
se = TRUE) # set to FALSE if you want to hide the SE range (gray)
## `geom_smooth()` using formula 'y ~ x'
Now let’s print a nice regression output table
huxreg(reg)
| (1) |
(Intercept) | 698.933 *** |
| (9.467) |
str | -2.280 *** |
| (0.480) |
N | 420 |
R2 | 0.051 |
logLik | -1822.250 |
AIC | 3650.499 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
We can customize it a bit
huxreg("Test Score" = reg, # set Y name
coefs = c("Intercept" = "(Intercept)", # rename X variable & intercept
"STR" = "str"),
statistics = c("N" = "nobs", # show only these stats and name them
"R-Squared" = "r.squared",
"SER" = "sigma"),
number_format = 2) # round to 2 decimals
| Test Score |
Intercept | 698.93 *** |
| (9.47) |
STR | -2.28 *** |
| (0.48) |
N | 420 |
R-Squared | 0.05 |
SER | 18.58 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Regression Analysis
# first let's use augment again, and *save* as aug_reg
aug_reg <- augment(reg)
# now we can plot a histogram using aug_reg
ggplot(data = aug_reg)+
aes(x = .resid)+ # the residuals!
geom_histogram(color = "white",
fill = "hotpink")+
labs(x = expression(paste("Residual, ", hat(u))))+
theme_pander(base_family = "Fira Sans Condensed",
base_size=20)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = aug_reg)+
aes(x = str,
y = .resid)+
geom_point(color="blue")+
geom_hline(aes(yintercept = 0), color="red")+
labs(x = "Student to Teacher Ratio",
y = expression(paste("Residual, ", hat(u))))+
theme_pander(base_family = "Fira Sans Condensed",
base_size=20)
Testing for heteroskedasticity
# using package lmtest
bptest(reg)
##
## studentized Breusch-Pagan test
##
## data: reg
## BP = 5.7936, df = 1, p-value = 0.01608
The small p-value (about 0.016) rejects the null of homoskedasticity at the 5% level, so robust standard errors are warranted.
# using package estimatr
reg_robust <- lm_robust(testscr ~ str,
data = CASchool,
se_type = "stata")
reg_robust %>% summary()
##
## Call:
## lm_robust(formula = testscr ~ str, data = CASchool, se_type = "stata")
##
## Standard error type: HC1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 698.93 10.3644 67.436 9.487e-227 678.560 719.306 418
## str -2.28 0.5195 -4.389 1.447e-05 -3.301 -1.259 418
##
## Multiple R-squared: 0.05124 , Adjusted R-squared: 0.04897
## F-statistic: 19.26 on 1 and 418 DF, p-value: 1.447e-05
huxreg("Normal" = reg,
"Robust" = reg_robust,
coefs = c("Intercept" = "(Intercept)",
"STR" = "str"),
statistics = c("N" = "nobs",
"R-Squared" = "r.squared",
"SER" = "sigma"),
number_format = 2)
| Normal | Robust |
Intercept | 698.93 *** | 698.93 *** |
| (9.47) | (10.36) |
STR | -2.28 *** | -2.28 *** |
| (0.48) | (0.52) |
N | 420 | 420 |
R-Squared | 0.05 | 0.05 |
SER | 18.58 | |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Check for outliers
# using car package
outlierTest(reg) # will point out which obs #s seem outliers
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 417 2.636833 0.008681 NA
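No observation is flagged at the Bonferroni-adjusted 5% level. As an optional extra check (a sketch using the aug_reg object created earlier), we can sort by Cook's distance to see the most influential observations:
# optional: view the observations with the largest Cook's distance
aug_reg %>%
arrange(desc(.cooksd)) %>%
head(n = 5)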
Multivariate Regressions
Correlation Table
CASchool %>%
select("str","testscr","el_pct", "expn_stu") %>%
cor()
## str testscr el_pct expn_stu
## str 1.0000000 -0.2263628 0.18764237 -0.61998215
## testscr -0.2263628 1.0000000 -0.64412374 0.19127277
## el_pct 0.1876424 -0.6441237 1.00000000 -0.07139604
## expn_stu -0.6199821 0.1912728 -0.07139604 1.00000000
Note that str is positively correlated with el_pct (0.19) while el_pct is strongly negatively correlated with testscr (-0.64), a hint that omitting el_pct will bias the simple regression of testscr on str.
Conditional Distribution (of Test Score) by Low/High ESL
# make a new variable called ESL
# = "High ESL" (if el_pct is above the median) or "Low ESL" (if at or below the median)
CASchool <- CASchool %>% # create a new dummy variable called ESL
mutate(ESL = ifelse(el_pct > median(el_pct), # test if el_pct is above the median
yes = "High ESL", # if yes, label it "High ESL"
no = "Low ESL")) # if no, label it "Low ESL"
# get average test score by high/low ESL
CASchool %>%
group_by(ESL) %>%
summarize(Average_test_score = mean(testscr))
ESL | Average_test_score |
High ESL | 644 |
Low ESL | 664 |
# make distributions
ggplot(data = CASchool)+
aes(x = testscr,
fill = ESL)+
geom_density(alpha=0.5)+
labs(x = "Test Score",
y = "Density")+
ggthemes::theme_pander(
base_family = "Fira Sans Condensed",
base_size=20
)+
theme(legend.position = "bottom")
# make scatterplot
esl_scatter<-ggplot(data = CASchool)+
aes(x = str,
y = testscr,
color = ESL)+
geom_point()+
geom_smooth(method="lm")+
labs(x = "STR",
y = "Test Score")+
ggthemes::theme_pander(
base_family = "Fira Sans Condensed",
base_size=20
)+
theme(legend.position = "bottom")
esl_scatter
## `geom_smooth()` using formula 'y ~ x'
# facet instead
esl_scatter+
facet_grid(~ESL)+
guides(color = "none") # hide the redundant color legend
## `geom_smooth()` using formula 'y ~ x'
Multivariate Regression (with %EL)
# run regression of testscr on str and el_pct
school_reg_2 <- lm(testscr ~ str + el_pct,
data = CASchool)
# look at it (just gives coefficients)
school_reg_2
##
## Call:
## lm(formula = testscr ~ str + el_pct, data = CASchool)
##
## Coefficients:
## (Intercept) str el_pct
## 686.0322 -1.1013 -0.6498
# get summary
school_reg_2 %>% summary()
##
## Call:
## lm(formula = testscr ~ str + el_pct, data = CASchool)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.845 -10.240 -0.308 9.815 43.461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 686.03225 7.41131 92.566 < 2e-16 ***
## str -1.10130 0.38028 -2.896 0.00398 **
## el_pct -0.64978 0.03934 -16.516 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.46 on 417 degrees of freedom
## Multiple R-squared: 0.4264, Adjusted R-squared: 0.4237
## F-statistic: 155 on 2 and 417 DF, p-value: < 2.2e-16
# do it tidier with broom
school_reg_2 %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 686 | 7.41 | 92.6 | 3.87e-280 |
str | -1.1 | 0.38 | -2.9 | 0.00398 |
el_pct | -0.65 | 0.0393 | -16.5 | 1.66e-47 |
# get regression fit statistics
school_reg_2 %>% glance()
r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
0.426 | 0.424 | 14.5 | 155 | 4.62e-51 | 2 | -1.72e+03 | 3.44e+03 | 3.46e+03 | 8.72e+04 | 417 | 420 |
# make a regression output table
library(huxtable)
huxreg("Model 1" = reg,
"Model 2" = school_reg_2,
coefs = c("Intercept" = "(Intercept)",
"Class Size" = "str",
"%ESL Students" = "el_pct"),
statistics = c("N" = "nobs",
"R-Squared" = "r.squared",
"SER" = "sigma"),
number_format = 2)
| Model 1 | Model 2 |
Intercept | 698.93 *** | 686.03 *** |
| (9.47) | (7.41) |
Class Size | -2.28 *** | -1.10 ** |
| (0.48) | (0.38) |
%ESL Students | | -0.65 *** |
| | (0.04) |
N | 420 | 420 |
R-Squared | 0.05 | 0.43 |
SER | 18.58 | 14.46 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Measuring OVB
# "true" regression (str and %EL)
true <- lm(testscr ~ str + el_pct, data = CASchool)
true %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 686 | 7.41 | 92.6 | 3.87e-280 |
str | -1.1 | 0.38 | -2.9 | 0.00398 |
el_pct | -0.65 | 0.0393 | -16.5 | 1.66e-47 |
# "omitted" regression (omitting %EL)
omitted <- lm(testscr ~ str, data = CASchool)
omitted %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 699 | 9.47 | 73.8 | 6.57e-242 |
str | -2.28 | 0.48 | -4.75 | 2.78e-06 |
# "auxiliary" regression (%EL on str)
aux <- lm(el_pct ~ str, data = CASchool)
aux %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | -19.9 | 9.16 | -2.17 | 0.0308 |
str | 1.81 | 0.464 | 3.91 | 0.00011 |
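We can now check the omitted variable bias (OVB) formula with these estimates: the bias in the omitted regression's str coefficient should equal the effect of el_pct in the true regression times the slope from the auxiliary regression. A quick check using the objects created above:
# OVB = (effect of omitted variable in true regression) x (aux regression slope)
beta_el <- coef(true)["el_pct"] # about -0.65
delta_str <- coef(aux)["str"] # about 1.81
beta_el * delta_str # about -1.18
# compare with the difference between the biased and true str coefficients
coef(omitted)["str"] - coef(true)["str"] # also about -1.18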
Variance and VIF
library(car) # needs car package
true %>% vif()
## str el_pct
## 1.036495 1.036495
# calculate VIF manually
# extract the R-squared from the auxiliary regression (R_j^2)
aux_r_sq <- glance(aux) %>%
select(r.squared)
aux_r_sq # look at it
# apply the VIF formula: 1 / (1 - R_j^2)
our_vif <- 1/(1 - aux_r_sq)
our_vif # should match the vif() output above (about 1.04)