Code
source(file = file.path("Source", "functions.R"))Read auxiliary functions
source(file = file.path("Source", "functions.R"))Load packages
source(file = file.path("Source", "rpackages.R"))Reading in the simulated data set
load(file.path("Source","gpg.RData"))Description of the simulated data set
# Überblick über alle Variablen
Hmisc::describe(gender_gap)gender_gap
8 Variables 400 Observations
--------------------------------------------------------------------------------
id
n missing distinct Info Mean pMedian Gmd .05
400 0 200 1 100.5 100.5 66.83 10.95
.10 .25 .50 .75 .90 .95
20.90 50.75 100.50 150.25 180.10 190.05
lowest : 1 2 3 4 5, highest: 196 197 198 199 200
--------------------------------------------------------------------------------
female
n missing distinct Info Sum Mean
400 0 2 0.75 197 0.4925
--------------------------------------------------------------------------------
age
n missing distinct Info Mean pMedian Gmd .05
400 0 44 0.999 41.63 41.5 13.83 23.00
.10 .25 .50 .75 .90 .95
25.00 30.75 42.00 52.00 58.00 60.00
lowest : 21 22 23 24 25, highest: 60 61 63 64 65
--------------------------------------------------------------------------------
school
n missing distinct Info Sum Mean
400 0 2 0.73 167 0.4175
--------------------------------------------------------------------------------
abi
n missing distinct Info Sum Mean
400 0 2 0.687 142 0.355
--------------------------------------------------------------------------------
uni
n missing distinct Info Sum Mean
400 0 2 0.724 163 0.4075
--------------------------------------------------------------------------------
f_job
n missing distinct Info Sum Mean
400 0 2 0.748 190 0.475
--------------------------------------------------------------------------------
income
n missing distinct Info Mean pMedian Gmd .05
400 0 317 1 1277 1276 308.8 859.0
.10 .25 .50 .75 .90 .95
940.8 1072.0 1274.1 1483.0 1650.9 1741.1
lowest : 637.155 677.314 716.796 722.134 724.943
highest: 1828.4 1849.37 1861.13 1876.41 1984.26
--------------------------------------------------------------------------------
# Univariate Beschreibung von income
gender_gap %>%
ggplot(aes(x = income)) +
geom_boxplot(fill = "steelblue") +
labs(title = "Verteilung von Einkommen", x = "Einkommen")gender_gap %>%
summarise(
mean_income = mean(income, na.rm = TRUE),
sd_income = sd(income, na.rm = TRUE)
)# Kreuztabelle
# Relative Häufigkeiten berechnen
gender_gap %>%
count(female, f_job) %>%
group_by(female) %>%
mutate(perc = round(100 * n / sum(n), 1)) %>%
ungroup() -> rel_table
rel_table %>%
ggplot(aes(x = f_job, y = perc, fill = as.factor(female))) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Relative Häufigkeiten von f_job nach Geschlecht",
x = "Beruf", y = "Anteil in %", fill = "female") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))rel_table %>%
select(female, f_job, perc) %>%
pivot_wider(names_from = f_job, values_from = perc, values_fill = 0)Sections 2.4 and 2.5
m0 <- formula(income ~ female + age + abi + uni + f_job)
m1 <- update(m0, ~ . + female:f_job)
m2 <- update(m0, ~ . - age + poly(age, 2, raw = TRUE))
m3 <- update(m2, ~ . + female:abi + female:uni)
fm0 <- glm(m0, data = gender_gap)
fm1 <- glm(m1, data = gender_gap)
fm2 <- glm(m2, data = gender_gap)
fm3 <- glm(m3, data = gender_gap)
res<-summary(fm0)
# Wert der Test-Statistik
res$coefficients[2, 3][1] -19.95646
# p-Wert
pv <- pt(res$coefficients[2, 3], df = res$df.residual, lower = TRUE)
# Ablehnbereich
qt(0.05, df = res$df.residual, lower = TRUE)[1] -1.64873
aic_fm0 <- aic(model.fit = fm0)
#$dev
#[1] 4526.809
#$aic
#[1] 4540.809
m1 <- update(m0,~.+ female:f_job)
fm1 <- glm(m1, data = gender_gap)
#summary(fm1)
aic_fm1 <- aic(model.fit=fm1) #
#$dev
#[1] 4526.512
#$aic
#[1] 4542.512
m2 <- update(m0,~.- age + poly(age, 2, raw = TRUE))
fm2 <- glm(m2, data = gender_gap)
#summary(fm2)
aic_fm2 <- aic(model.fit=fm2)
#$dev
#[1] 4444.715
#$aic
#[1] 4460.715
m3 <- update(m2,~. + female:abi + female:uni)
fm3 <- glm(m3, data = gender_gap)
#summary(fm3)
aic_fm3 <- aic(model.fit=fm3)
#$dev
#[1] 4443.553
#$aic
#[1] 4463.553
# AIC- und Deviance-Werte extrahieren
model_comparison <- data.frame(
Modell = c("fm0 (Basis)",
"fm1 (+ female:f_job)",
"fm2 (poly(age, 2))",
"fm3 (+ female:abi + female:uni)"),
Deviance = c(aic_fm0$dev,
aic_fm1$dev,
aic_fm2$dev,
aic_fm3$dev),
AIC = c(aic_fm0$aic,
aic_fm1$aic,
aic_fm2$aic,
aic_fm3$aic)
)
# Tabelle anzeigen
knitr::kable(model_comparison, caption = "Vergleich der Modelle anhand von Deviance und AIC")| Modell | Deviance | AIC |
|---|---|---|
| fm0 (Basis) | 4526.809 | 4540.809 |
| fm1 (+ female:f_job) | 4526.512 | 4542.512 |
| fm2 (poly(age, 2)) | 4444.715 | 4460.715 |
| fm3 (+ female:abi + female:uni) | 4443.553 | 4463.553 |
Section 2.6
## Tabelle 3
tidy(fm2) %>%
select(term, estimate, std.error, statistic, p.value) %>%
kable(format = "html", digits = 3, caption = "Koeffizienten – Modell fm2") | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 467.550 | 41.627 | 11.232 | 0 |
| female | -192.060 | 8.771 | -21.898 | 0 |
| abi | 76.621 | 7.346 | 10.431 | 0 |
| uni | 496.883 | 8.005 | 62.071 | 0 |
| f_job | -59.125 | 9.233 | -6.403 | 0 |
| poly(age, 2, raw = TRUE)1 | 27.409 | 2.074 | 13.218 | 0 |
| poly(age, 2, raw = TRUE)2 | -0.234 | 0.025 | -9.462 | 0 |
confint(fm2) %>%
as.data.frame() %>%
tibble::rownames_to_column("term") %>%
rename(
lower = `2.5 %`,
upper = `97.5 %`
) %>%
kable(format = "html", digits = 3, caption = "Konfidenzintervalle – Modell fm2")Waiting for profiling to be done...
| term | lower | upper |
|---|---|---|
| (Intercept) | 385.963 | 549.138 |
| female | -209.249 | -174.870 |
| abi | 62.224 | 91.019 |
| uni | 481.194 | 512.573 |
| f_job | -77.222 | -41.028 |
| poly(age, 2, raw = TRUE)1 | 23.345 | 31.474 |
| poly(age, 2, raw = TRUE)2 | -0.282 | -0.185 |
### Ergebnisdarstellung
print("Marginaler prädiktiver Vergleich des Alters")[1] "Marginaler prädiktiver Vergleich des Alters"
fm2_slopes_age <- slopes(fm2,variables = c("age"))|>
summarise(mean = mean(estimate), sd = sd(estimate))
fm2_slopes_ageprint("nach Geschlecht")[1] "nach Geschlecht"
# Getrennt nach Geschlecht
slopes(fm2,variables = c("age"))|> group_by(female) |>
summarise(mean = mean(estimate), sd = sd(estimate), n = n())Section 4
noise <- list()
anz <- 25
set.seed(12022)#
for (i in 1:anz){
noise[[i]] <- 20-rtruncnorm(400, a=0,b=20, mean = 10, sd = 2)
}
df.noise<-as.data.frame(noise)
names(df.noise) <- c("V1","V2","V3","V4","V5","V6","V7","V8","V9","V10",
"V11","V12","V13","V14","V15","V16","V17","V18","V19",
"V20","V21","V22","V23","V24","V25")
gender_gap_noise <- cbind(gender_gap,df.noise)
# Kandidatenmodelle
###############################################
candidate_models <- list (length=anz)
for(i in 1:anz){
candidate_models[[i]] <- update(m2,paste0("~. + ",names(df.noise)[i]))
}
# candidate_models[[5]]
summary_models <- list (length=anz)
for(i in 1:anz){
summary_models[[i]] <- summary(glm(candidate_models[[i]], data = gender_gap_noise))
}
# summary_models[[6]]
# summary_models[[17]]
# summary_models[[19]]
# summary_models[[25]]
########################################################
fm_final <- glm(update(m2,~. + V6+ V17+ V19 + V25) , data = gender_gap_noise)
#summary(fm_final)Summarize results of the estimation of the regression
tidy(fm_final)