Linear Regression

Author

jm

Load libraries

library("tidyverse")
library("ggpubr")
library("broom")

Simple regression

# Scatter plot of sepal width against sepal length with one linear fit
# (and its equation) over all observations.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  stat_smooth(method = "lm") +
  stat_regline_equation() +
  theme_bw()
`geom_smooth()` using formula = 'y ~ x'

# Same plot, but only the points are coloured by species; the colour mapping
# is local to geom_point(), so the linear fit is still drawn over all data.
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(mapping = aes(colour = Species)) +
  stat_smooth(method = "lm", se = FALSE) +
  stat_regline_equation() +
  theme_bw()
`geom_smooth()` using formula = 'y ~ x'

# Colour is mapped in the top-level aes(), so every layer inherits the
# species grouping: points, linear fits and equations are drawn per species.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, colour = Species)
) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  stat_regline_equation() +
  theme_bw()
`geom_smooth()` using formula = 'y ~ x'

# Fit the simple regression of sepal width on sepal length and print its
# coefficient table as a tibble.
tidy(lm(Sepal.Width ~ Sepal.Length, data = iris))
# A tibble: 2 × 5
  term         estimate std.error statistic  p.value
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)    3.42      0.254      13.5  1.55e-27
2 Sepal.Length  -0.0619    0.0430     -1.44 1.52e- 1

Multiple regression

Without interaction (common slope, species-specific intercepts)

# Additive model: one shared Sepal.Length slope plus a Species term
model_multy <- lm(formula = Sepal.Width ~ Sepal.Length + Species, data = iris)

# Coefficient table of the additive model
tidy(model_multy)
# A tibble: 4 × 5
  term              estimate std.error statistic  p.value
  <chr>                <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)          1.68     0.235       7.12 4.46e-11
2 Sepal.Length         0.350    0.0463      7.56 4.19e-12
3 Speciesversicolor   -0.983    0.0721    -13.6  7.62e-28
4 Speciesvirginica    -1.01     0.0933    -10.8  2.41e-20
# predict(model_multy, iris)
# Store the additive model's predictions as a new column of iris
iris[["predicted_multy"]] <- predict(model_multy, newdata = iris)

# Observed points with the model's predicted line for each species
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, colour = Species)
) +
  geom_point() +
  geom_line(mapping = aes(y = predicted_multy), linewidth = 1) +
  labs(
    title = "Sepal Width vs Sepal Length by Species",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()

# iris %>% 
#   ggplot(aes(Sepal.Length, predicted_multy))+
#   geom_point(aes(colour = Species))

With interaction (different slopes)

# Interaction model: `*` adds both main effects and the
# Sepal.Length:Species interaction terms
model_int <- lm(formula = Sepal.Width ~ Sepal.Length * Species, data = iris)

# Coefficient table of the interaction model
tidy(model_int)
# A tibble: 6 × 5
  term                           estimate std.error statistic  p.value
  <chr>                             <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)                      -0.569     0.554     -1.03 3.06e- 1
2 Sepal.Length                      0.799     0.110      7.23 2.55e-11
3 Speciesversicolor                 1.44      0.713      2.02 4.51e- 2
4 Speciesvirginica                  2.02      0.686      2.94 3.85e- 3
5 Sepal.Length:Speciesversicolor   -0.479     0.134     -3.58 4.65e- 4
6 Sepal.Length:Speciesvirginica    -0.567     0.126     -4.49 1.45e- 5
# Store the interaction model's predictions as a new column of iris
iris$predicted <- predict(model_int, newdata = iris)

# Observed points with the model's predicted line for each species
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point() +
  # Line thickness is set with `linewidth`; `size` for lines is deprecated
  # since ggplot2 3.4.0 and triggers a warning.
  geom_line(aes(y = predicted), linewidth = 1) +
  #geom_smooth(method="lm", se = FALSE) +
  labs(title = "Sepal Width vs Sepal Length by Species",
       x = "Sepal Length",
       y = "Sepal Width") +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.