# sat_act <- sat.act %>%
# mutate(., gender_fac = ifelse(gender == 1, "male", "female")) %>%
# mutate(.,
# gender_fac = case_when(gender == 1 ~ "male",
# gender == 2 ~ "female"),
# education_fac = case_when(education == 0 ~ "none",
# education == 1 ~ "some_hs",
# education == 2 ~ "high_school",
# education == 3 ~ "some_college",
# education == 4 ~ "college",
# education == 5 ~ "graduate")
# )
data("sat.act", package="psych")
sat.act <- sat.act %>% dplyr::rename(sex=gender) %>%
mutate(sex = factor(sex, levels = c(1,2), labels = c("male", "female")))
vcd::mosaic(~ sex+education, data = sat.act, shade = TRUE, legend = TRUE)
The Pearson product moment correlation seeks to measure the linear association between two variables, \(x\) and \(y\) on a standardized scale ranging from \(r = -1 -- 1\).
The correlation of x and y is a covariance that has been standardized by the standard deviations of \(x\) and \(y\). This yields a scale-insensitive measure of the linear association of \(x\) and \(y\). For much more conceptual detail, see: https://psu-psychology.github.io/psy-597-SEM/01_correlation_regression/01_Correlation_and_Regression.html.
\[r_{XY}= \frac{s_{XY}}{s_{X} s_{Y}}\]
To visualize this relationship, we can pass the raw data to ggplot
and get a simple regression line using stat_smooth()
with the lm
method.
ggplot(sat.act, aes(x=age, y=ACT)) +
geom_jitter(width=0.1) +
stat_smooth(method="lm", se=FALSE)
lm_SAT <- lm(SATV ~ SATQ, data=sat.act)
In simple regression, we are interested in a relationship of the form: \[ Y = B_0 + B_1 X \] where \(Y\) is the dependent variable (criterion) and \(X\) is the predictor (covariate). The intercept is represented by \(B0\) and the slope for the \(X\) predictor by \(B1\).
When conducting regression, we typically try to capture linear relationships among variables. We can introduce higher-order polynomial terms (e.g., quadratic models) or splines (more flexible shapes), but this beyond the scope here.
We can also get useful diagnostic plots for free using the plot()
function:
plot(lm_SAT)
Nonparametric bootstrapping approximates the sampling distribution for a statistic of interest (e.g., a slope) by resampling the existing data with replacement many times and examining the resulting density.
It appears that these are both influential predictors. We could examine the relationship graphically.
ggplot(sat.act, aes(x=SATV, y=SATQ, color=sex)) +
geom_point() +
scale_color_discrete(name="Sex", breaks = c("1", "2"), labels = c("Male", "Female")) +
labs(title = "Multiple Regression", x = "SATV", y = "SATQ") +
stat_smooth(method=lm, se=FALSE)
# 2-way interation plotting
ggplot(data=sat.act,
aes(x=age, y=SATQ, colour = sex)) +
geom_jitter()+
labs(x = "Age", y = "Math Score", colour = "Sex") +
theme_bw()+
theme(
plot.background = element_blank()
,panel.grid.major = element_blank()
,panel.grid.minor = element_blank()
,panel.border = element_blank()
) +
stat_smooth(method='lm', se=TRUE, fullrange=TRUE) +
scale_color_manual(labels = c("Male", "Female"), values = c("#d21959", "#4aded3"))
theme(axis.text=element_text(size=12),
axis.title=element_text(size=14))
## List of 2
## $ axis.title:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 14
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 12
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
The emmeans
package also provides useful plots to understand pairwise differences:
sat.act$agef[sat.act$age < 25] <- 1
sat.act$agef[sat.act$age >= 25 & sat.act$age <= 50] <- 2
sat.act$agef[sat.act$age > 50] <- 3
# setting as factors
sat.act$agef <- as.factor(sat.act$agef)
sat.act$gender <- as.factor(sat.act$sex)
# running the model
sat.lm <- lm(SATQ ~ agef + SATV, data = sat.act)
sat.emm.s <- emmeans(sat.lm, "agef")
plot(sat.emm.s, comparisons = TRUE)
The blue bars are confidence intervals for the EMMs, and the red arrows are for the comparisons among them. If an arrow from one mean overlaps an arrow from another group, the difference is not significant, based on the adjust setting (which defaults to “tukey”). (Note: Don’t ever use confidence intervals for EMMs to perform comparisons; they can be very misleading.)
We can visualize the interaction as follows:
ggplot(data=sat.act,
aes(x=SATV, y=SATQ, colour = factor(agef))) +
geom_jitter()+
labs(x = "Verbal Score", y = "Math Score", colour = "Age") +
theme_bw()+
theme(
plot.background = element_blank()
,panel.grid.major = element_blank()
,panel.grid.minor = element_blank()
,panel.border = element_blank()
) +
stat_smooth(method='lm', se=FALSE, fullrange=TRUE) +
scale_color_manual(labels = c("Under 25", "25-50", "Over 50"), values = c("red", "green", "blue"))
theme(axis.text=element_text(size=12),
axis.title=element_text(size=14))
## List of 2
## $ axis.title:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 14
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 12
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
In the sat.act dataset, we have treated age as a factor. If we keep this representation (as opposed to entering age as continuous), we can easily get orthogonal polynomial contrasts in emmeans
. For example, is the effect of age linearly related to SATV, or might it be quadratic or cubic?
sat.emm.p <- emmeans(sat.lm, "agef")
ply <- contrast(sat.emm.p, "poly")
ply
## contrast estimate SE df t.ratio p.value
## linear 15.5 19.0 683 0.819 0.4130
## quadratic 15.7 22.1 683 0.712 0.4770
coef(ply) #show the contrast coefficients
## agef c.1 c.2
## 1 1 -1 1
## 2 2 0 -2
## 3 3 1 1
Finally, we can examine effects in multivariate regression models (i.e., multiple DVs). Here, we can examine the both the verbal and math scores and if they are associated with age, gender (sex), education, or ACT scores.
org.int <- lm(cbind(SATV, SATQ) ~ age + gender + education + ACT, data = sat.act)
summary(org.int)
## Response SATV :
##
## Call:
## lm(formula = SATV ~ age + gender + education + ACT, data = sat.act)
##
## Residuals:
## Min 1Q Median 3Q Max
## -502.0 -47.9 7.6 60.1 239.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 261.2947 23.3033 11.21 <2e-16 ***
## age -1.3893 0.4537 -3.06 0.0023 **
## genderfemale -0.0713 7.5002 -0.01 0.9924
## education 1.4926 3.0573 0.49 0.6256
## ACT 13.3790 0.7487 17.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93.3 on 682 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.326, Adjusted R-squared: 0.322
## F-statistic: 82.3 on 4 and 682 DF, p-value: <2e-16
##
##
## Response SATQ :
##
## Call:
## lm(formula = SATQ ~ age + gender + education + ACT, data = sat.act)
##
## Residuals:
## Min 1Q Median 3Q Max
## -492.5 -48.0 8.2 63.3 257.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 259.647 22.891 11.34 < 2e-16 ***
## age -1.352 0.446 -3.03 0.0025 **
## genderfemale -34.805 7.367 -4.72 2.8e-06 ***
## education 1.102 3.003 0.37 0.7138
## ACT 14.155 0.735 19.25 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 91.7 on 682 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.375, Adjusted R-squared: 0.372
## F-statistic: 102 on 4 and 682 DF, p-value: <2e-16
Visualizing ANOVA: Color line plot by a second group: “gender”
data_summary <- function(data, varname, groupnames){
require(plyr)
summary_func <- function(x, col){
c(mean = mean(x[[col]], na.rm=TRUE),
sd = sd(x[[col]], na.rm=TRUE))
}
data_sum <- ddply(data, groupnames, .fun=summary_func,
varname)
data_sum <- rename(data_sum, c("mean" = varname))
return(data_sum)
}
sat_act <- sat.act %>%
mutate(.,
education_fac = case_when(education == 0 ~ "none",
education == 1 ~ "some_hs",
education == 2 ~ "high_school",
education == 3 ~ "some_college",
education == 4 ~ "college",
education == 5 ~ "graduate")
)
df3 <- data_summary(sat_act, varname="ACT",
groupnames=c("education_fac", "gender"))
df3$education_fac <- factor(df3$education_fac,levels=c("none", "some_hs","high_school", "some_college", "college","graduate"))
ggplot(df3, aes(x=education_fac, y=ACT, group=gender, color=gender)) +
geom_errorbar(aes(ymin=ACT-sd, ymax=ACT+sd), width=.1,
position=position_dodge(0.05)) +
geom_line() +
geom_point()+
scale_color_brewer(palette="Paired")+theme_minimal()