# sat_act <- sat.act %>%
#   mutate(., gender_fac = ifelse(gender == 1, "male", "female")) %>%
#   mutate(.,
#     gender_fac = case_when(gender == 1 ~ "male",
#                            gender == 2 ~ "female"),
#     education_fac = case_when(education == 0 ~ "none",
#                               education == 1 ~ "some_hs",
#                               education == 2 ~ "high_school",
#                               education == 3 ~ "some_college",
#                               education == 4 ~ "college",
#                               education == 5 ~ "graduate")
#   )
data("sat.act", package="psych")
sat.act <- sat.act %>% dplyr::rename(sex=gender) %>%
mutate(sex = factor(sex, levels = c(1,2), labels = c("male", "female")))

Chi-Squared

Visualizing Chi-Squared.

vcd::mosaic(~ sex+education, data = sat.act, shade = TRUE, legend = TRUE)

Correlation

The Pearson product moment correlation seeks to measure the linear association between two variables, \(x\) and \(y\) on a standardized scale ranging from \(r = -1 -- 1\).

The correlation of x and y is a covariance that has been standardized by the standard deviations of \(x\) and \(y\). This yields a scale-insensitive measure of the linear association of \(x\) and \(y\). For much more conceptual detail, see: https://psu-psychology.github.io/psy-597-SEM/01_correlation_regression/01_Correlation_and_Regression.html.

\[r_{XY}= \frac{s_{XY}}{s_{X} s_{Y}}\]

Visualizing Correlations

To visualize this relationship, we can pass the raw data to ggplot and get a simple regression line using stat_smooth() with the lm method.

ggplot(sat.act, aes(x=age, y=ACT)) +
  geom_jitter(width=0.1) +
  stat_smooth(method="lm", se=FALSE)

Linear Models

lm_SAT <- lm(SATV ~ SATQ, data=sat.act)

Notes on overarching regression principles

In simple regression, we are interested in a relationship of the form: \[ Y = B_0 + B_1 X \] where \(Y\) is the dependent variable (criterion) and \(X\) is the predictor (covariate). The intercept is represented by \(B0\) and the slope for the \(X\) predictor by \(B1\).

When conducting regression, we typically try to capture linear relationships among variables. We can introduce higher-order polynomial terms (e.g., quadratic models) or splines (more flexible shapes), but this beyond the scope here.

Alternative graphical diagnostic plots.

We can also get useful diagnostic plots for free using the plot() function:

plot(lm_SAT)

Notes on bootstraping

Nonparametric bootstrapping approximates the sampling distribution for a statistic of interest (e.g., a slope) by resampling the existing data with replacement many times and examining the resulting density.

Viszualizing regression results

with two predictors – one categorical, one continous

It appears that these are both influential predictors. We could examine the relationship graphically.

ggplot(sat.act, aes(x=SATV, y=SATQ, color=sex)) + 
  geom_point() + 
  scale_color_discrete(name="Sex", breaks = c("1", "2"), labels = c("Male", "Female")) +
  labs(title = "Multiple Regression", x = "SATV", y = "SATQ") +
  stat_smooth(method=lm, se=FALSE)

regression interaction plots (one categorical, one continous)

# 2-way interation plotting
ggplot(data=sat.act, 
       aes(x=age, y=SATQ, colour = sex)) +
  geom_jitter()+
  labs(x = "Age", y = "Math Score", colour = "Sex") +
  theme_bw()+
  theme(
    plot.background = element_blank()
    ,panel.grid.major = element_blank()
    ,panel.grid.minor = element_blank()
    ,panel.border = element_blank()
  ) +
  stat_smooth(method='lm', se=TRUE, fullrange=TRUE) +
  scale_color_manual(labels = c("Male", "Female"), values = c("#d21959", "#4aded3"))

theme(axis.text=element_text(size=12),
      axis.title=element_text(size=14))

## List of 2
##  $ axis.title:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 14
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 12
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

emmeans plots

The emmeans package also provides useful plots to understand pairwise differences:

sat.act$agef[sat.act$age < 25] <- 1
sat.act$agef[sat.act$age >= 25 & sat.act$age <= 50] <- 2
sat.act$agef[sat.act$age > 50] <- 3

# setting as factors
sat.act$agef <- as.factor(sat.act$agef)
sat.act$gender <- as.factor(sat.act$sex)

# running the model
sat.lm <- lm(SATQ ~ agef + SATV, data = sat.act)
sat.emm.s <- emmeans(sat.lm, "agef")
plot(sat.emm.s, comparisons = TRUE)

The blue bars are confidence intervals for the EMMs, and the red arrows are for the comparisons among them. If an arrow from one mean overlaps an arrow from another group, the difference is not significant, based on the adjust setting (which defaults to “tukey”). (Note: Don’t ever use confidence intervals for EMMs to perform comparisons; they can be very misleading.)

visualizing the interaction from pairwise comparisons

We can visualize the interaction as follows:

ggplot(data=sat.act, 
       aes(x=SATV, y=SATQ, colour = factor(agef))) +
  geom_jitter()+
  labs(x = "Verbal Score", y = "Math Score", colour = "Age") +
  theme_bw()+
  theme(
    plot.background = element_blank()
    ,panel.grid.major = element_blank()
    ,panel.grid.minor = element_blank()
    ,panel.border = element_blank()
  ) +
  stat_smooth(method='lm', se=FALSE, fullrange=TRUE) +
  scale_color_manual(labels = c("Under 25", "25-50", "Over 50"), values = c("red", "green", "blue"))

theme(axis.text=element_text(size=12),
      axis.title=element_text(size=14))

## List of 2
##  $ axis.title:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 14
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 12
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Testing non-linear effects

A few other emmeans features

In the sat.act dataset, we have treated age as a factor. If we keep this representation (as opposed to entering age as continuous), we can easily get orthogonal polynomial contrasts in emmeans. For example, is the effect of age linearly related to SATV, or might it be quadratic or cubic?

sat.emm.p <- emmeans(sat.lm, "agef")
ply <- contrast(sat.emm.p, "poly")
ply

##  contrast  estimate   SE  df t.ratio p.value
##  linear        15.5 19.0 683 0.819   0.4130 
##  quadratic     15.7 22.1 683 0.712   0.4770

coef(ply) #show the contrast coefficients

##   agef c.1 c.2
## 1    1  -1   1
## 2    2   0  -2
## 3    3   1   1

Multivariate regression models

Finally, we can examine effects in multivariate regression models (i.e., multiple DVs). Here, we can examine the both the verbal and math scores and if they are associated with age, gender (sex), education, or ACT scores.

org.int <- lm(cbind(SATV, SATQ) ~ age + gender + education + ACT, data = sat.act)
summary(org.int)

## Response SATV :
## 
## Call:
## lm(formula = SATV ~ age + gender + education + ACT, data = sat.act)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -502.0  -47.9    7.6   60.1  239.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  261.2947    23.3033   11.21   <2e-16 ***
## age           -1.3893     0.4537   -3.06   0.0023 ** 
## genderfemale  -0.0713     7.5002   -0.01   0.9924    
## education      1.4926     3.0573    0.49   0.6256    
## ACT           13.3790     0.7487   17.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 93.3 on 682 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.326,  Adjusted R-squared:  0.322 
## F-statistic: 82.3 on 4 and 682 DF,  p-value: <2e-16
## 
## 
## Response SATQ :
## 
## Call:
## lm(formula = SATQ ~ age + gender + education + ACT, data = sat.act)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -492.5  -48.0    8.2   63.3  257.1 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   259.647     22.891   11.34  < 2e-16 ***
## age            -1.352      0.446   -3.03   0.0025 ** 
## genderfemale  -34.805      7.367   -4.72  2.8e-06 ***
## education       1.102      3.003    0.37   0.7138    
## ACT            14.155      0.735   19.25  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 91.7 on 682 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.375,  Adjusted R-squared:  0.372 
## F-statistic:  102 on 4 and 682 DF,  p-value: <2e-16

ANOVA

Visualizing ANOVA: Color line plot by a second group: “gender”

data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE))
  }
  data_sum <- ddply(data, groupnames, .fun=summary_func,
                  varname)
  data_sum <- rename(data_sum, c("mean" = varname))
 return(data_sum)
}
sat_act <- sat.act %>%
  mutate(.,
    education_fac = case_when(education == 0 ~ "none",
                              education == 1 ~ "some_hs",
                              education == 2 ~ "high_school",
                              education == 3 ~ "some_college",
                              education == 4 ~ "college",
                              education == 5 ~ "graduate")
  )
df3 <- data_summary(sat_act, varname="ACT", 
                    groupnames=c("education_fac", "gender"))
df3$education_fac <- factor(df3$education_fac,levels=c("none", "some_hs","high_school", "some_college", "college","graduate"))
ggplot(df3, aes(x=education_fac, y=ACT, group=gender, color=gender)) + 
    geom_errorbar(aes(ymin=ACT-sd, ymax=ACT+sd), width=.1, 
                  position=position_dodge(0.05)) +
   geom_line() + 
   geom_point()+
   scale_color_brewer(palette="Paired")+theme_minimal()

Supplemental Materials for Data Analyses

Daniel Albohn, Kayla Brown, & Yiming Qian

2019-08-23 15:17:36