Multiple lineare Regression: Ein vertiefter Einblick in die statistische Modellierung

Author

Mariana Nold, Florian Meinfelder

Published

2025-09-29

Read auxiliary functions

Code
source(file = file.path("Source", "functions.R"))

Load packages

Code
source(file = file.path("Source", "rpackages.R"))

Reading in the simulated data set

Code
load(file.path("Source","gpg.RData"))

Description of the simulated data set

Code
# Überblick über alle Variablen
Hmisc::describe(gender_gap)
gender_gap 

 8  Variables      400  Observations
--------------------------------------------------------------------------------
id 
       n  missing distinct     Info     Mean  pMedian      Gmd      .05 
     400        0      200        1    100.5    100.5    66.83    10.95 
     .10      .25      .50      .75      .90      .95 
   20.90    50.75   100.50   150.25   180.10   190.05 

lowest :   1   2   3   4   5, highest: 196 197 198 199 200
--------------------------------------------------------------------------------
female 
       n  missing distinct     Info      Sum     Mean 
     400        0        2     0.75      197   0.4925 

--------------------------------------------------------------------------------
age 
       n  missing distinct     Info     Mean  pMedian      Gmd      .05 
     400        0       44    0.999    41.63     41.5    13.83    23.00 
     .10      .25      .50      .75      .90      .95 
   25.00    30.75    42.00    52.00    58.00    60.00 

lowest : 21 22 23 24 25, highest: 60 61 63 64 65
--------------------------------------------------------------------------------
school 
       n  missing distinct     Info      Sum     Mean 
     400        0        2     0.73      167   0.4175 

--------------------------------------------------------------------------------
abi 
       n  missing distinct     Info      Sum     Mean 
     400        0        2    0.687      142    0.355 

--------------------------------------------------------------------------------
uni 
       n  missing distinct     Info      Sum     Mean 
     400        0        2    0.724      163   0.4075 

--------------------------------------------------------------------------------
f_job 
       n  missing distinct     Info      Sum     Mean 
     400        0        2    0.748      190    0.475 

--------------------------------------------------------------------------------
income 
       n  missing distinct     Info     Mean  pMedian      Gmd      .05 
     400        0      317        1     1277     1276    308.8    859.0 
     .10      .25      .50      .75      .90      .95 
   940.8   1072.0   1274.1   1483.0   1650.9   1741.1 

lowest : 637.155 677.314 716.796 722.134 724.943
highest: 1828.4  1849.37 1861.13 1876.41 1984.26
--------------------------------------------------------------------------------
Code
# Univariate Beschreibung von income
gender_gap %>% 
  ggplot(aes(x = income)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Verteilung von Einkommen", x = "Einkommen")

Code
gender_gap %>% 
  summarise(
    mean_income = mean(income, na.rm = TRUE),
    sd_income = sd(income, na.rm = TRUE)
  )
Code
# Kreuztabelle
# Relative Häufigkeiten berechnen
gender_gap %>%
  count(female, f_job) %>%
  group_by(female) %>%
  mutate(perc = round(100 * n / sum(n), 1)) %>%
  ungroup() -> rel_table

rel_table %>%
  ggplot(aes(x = f_job, y = perc, fill = as.factor(female))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Relative Häufigkeiten von f_job nach Geschlecht",
       x = "Beruf", y = "Anteil in %", fill = "female") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
rel_table %>%
  select(female, f_job, perc) %>%
  pivot_wider(names_from = f_job, values_from = perc, values_fill = 0)

Sections 2.4 and 2.5

Code
m0 <- formula(income ~ female + age + abi + uni + f_job)
m1 <- update(m0, ~ . + female:f_job)
m2 <- update(m0, ~ . - age + poly(age, 2, raw = TRUE))
m3 <- update(m2, ~ . + female:abi + female:uni)

fm0 <- glm(m0, data = gender_gap)
fm1 <- glm(m1, data = gender_gap)
fm2 <- glm(m2, data = gender_gap)
fm3 <- glm(m3, data = gender_gap)

res<-summary(fm0)
# Wert der Test-Statistik
res$coefficients[2, 3]
[1] -19.95646
Code
# p-Wert
pv <- pt(res$coefficients[2, 3], df = res$df.residual, lower = TRUE)
# Ablehnbereich
qt(0.05, df = res$df.residual, lower = TRUE)
[1] -1.64873
Code
aic_fm0 <- aic(model.fit = fm0)
#$dev
#[1] 4526.809

#$aic
#[1] 4540.809
m1 <- update(m0,~.+ female:f_job)
fm1 <- glm(m1, data = gender_gap) 
#summary(fm1)
aic_fm1 <- aic(model.fit=fm1) # 
#$dev
#[1] 4526.512

#$aic
#[1] 4542.512

m2 <- update(m0,~.- age + poly(age, 2, raw = TRUE))
fm2 <- glm(m2, data = gender_gap) 
#summary(fm2)

aic_fm2 <- aic(model.fit=fm2)
#$dev
#[1] 4444.715

#$aic
#[1] 4460.715

m3 <- update(m2,~. + female:abi + female:uni)
fm3 <- glm(m3, data = gender_gap) 
#summary(fm3)
aic_fm3 <- aic(model.fit=fm3) 
#$dev
#[1] 4443.553

#$aic
#[1] 4463.553

# AIC- und Deviance-Werte extrahieren
model_comparison <- data.frame(
  Modell = c("fm0 (Basis)", 
             "fm1 (+ female:f_job)", 
             "fm2 (poly(age, 2))", 
             "fm3 (+ female:abi + female:uni)"),
  Deviance = c(aic_fm0$dev, 
               aic_fm1$dev, 
               aic_fm2$dev, 
               aic_fm3$dev),
  AIC = c(aic_fm0$aic, 
          aic_fm1$aic, 
          aic_fm2$aic, 
          aic_fm3$aic)
)

# Tabelle anzeigen
knitr::kable(model_comparison, caption = "Vergleich der Modelle anhand von Deviance und AIC")
Vergleich der Modelle anhand von Deviance und AIC
Modell Deviance AIC
fm0 (Basis) 4526.809 4540.809
fm1 (+ female:f_job) 4526.512 4542.512
fm2 (poly(age, 2)) 4444.715 4460.715
fm3 (+ female:abi + female:uni) 4443.553 4463.553

Section 2.6

Code
## Tabelle 3
tidy(fm2) %>%
  select(term, estimate, std.error, statistic, p.value) %>%
  kable(format = "html", digits = 3, caption = "Koeffizienten – Modell fm2") 
Koeffizienten – Modell fm2
term estimate std.error statistic p.value
(Intercept) 467.550 41.627 11.232 0
female -192.060 8.771 -21.898 0
abi 76.621 7.346 10.431 0
uni 496.883 8.005 62.071 0
f_job -59.125 9.233 -6.403 0
poly(age, 2, raw = TRUE)1 27.409 2.074 13.218 0
poly(age, 2, raw = TRUE)2 -0.234 0.025 -9.462 0
Code
confint(fm2) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("term") %>%
  rename(
    lower = `2.5 %`,
    upper = `97.5 %`
  ) %>%
  kable(format = "html", digits = 3, caption = "Konfidenzintervalle – Modell fm2")
Waiting for profiling to be done...
Konfidenzintervalle – Modell fm2
term lower upper
(Intercept) 385.963 549.138
female -209.249 -174.870
abi 62.224 91.019
uni 481.194 512.573
f_job -77.222 -41.028
poly(age, 2, raw = TRUE)1 23.345 31.474
poly(age, 2, raw = TRUE)2 -0.282 -0.185
Code
### Ergebnisdarstellung

print("Marginaler prädiktiver Vergleich des Alters")
[1] "Marginaler prädiktiver Vergleich des Alters"
Code
fm2_slopes_age <- slopes(fm2,variables = c("age"))|> 
  summarise(mean = mean(estimate), sd = sd(estimate))

fm2_slopes_age
Code
print("nach Geschlecht")
[1] "nach Geschlecht"
Code
# Getrennt nach Geschlecht

slopes(fm2,variables = c("age"))|> group_by(female) |> 
  summarise(mean = mean(estimate), sd = sd(estimate), n = n())

Section 4

Code
noise <- list()
anz <- 25
set.seed(12022)#
for (i in 1:anz){
        noise[[i]] <- 20-rtruncnorm(400, a=0,b=20, mean = 10, sd = 2)
}
df.noise<-as.data.frame(noise)
names(df.noise) <- c("V1","V2","V3","V4","V5","V6","V7","V8","V9","V10",
                     "V11","V12","V13","V14","V15","V16","V17","V18","V19",
                     "V20","V21","V22","V23","V24","V25")


gender_gap_noise <- cbind(gender_gap,df.noise)
# Kandidatenmodelle
###############################################
candidate_models <- list (length=anz)
for(i in 1:anz){
  candidate_models[[i]] <- update(m2,paste0("~. + ",names(df.noise)[i]))
}
# candidate_models[[5]] 

summary_models <- list (length=anz)
for(i in 1:anz){
  summary_models[[i]] <- summary(glm(candidate_models[[i]], data = gender_gap_noise))
}
# summary_models[[6]] 
# summary_models[[17]] 
# summary_models[[19]] 
# summary_models[[25]]

########################################################

fm_final <- glm(update(m2,~. + V6+ V17+ V19 + V25) , data = gender_gap_noise) 
#summary(fm_final)

Summarize results of the estimation of the regression

Code
tidy(fm_final)