Code
source(file = file.path("Source", "functions.R"))Read in path and auxiliary functions
source(file = file.path("Source", "functions.R"))Load packages
source(file = file.path("Source", "rpackages.R"))Reading in the data set
library(MASS)
Attache Paket: 'MASS'
Das folgende Objekt ist maskiert 'package:dplyr':
select
data("birthwt")
head(birthwt)Description of the data set
# Überblick über alle Variablen
# Dekription des simulierten Datensatzes
library(Hmisc)
Hmisc::describe(birthwt)birthwt
10 Variables 189 Observations
--------------------------------------------------------------------------------
low
n missing distinct Info Sum Mean
189 0 2 0.644 59 0.3122
--------------------------------------------------------------------------------
age
n missing distinct Info Mean pMedian Gmd .05
189 0 24 0.996 23.24 23 5.919 16
.10 .25 .50 .75 .90 .95
17 19 23 26 31 32
lowest : 14 15 16 17 18, highest: 33 34 35 36 45
--------------------------------------------------------------------------------
lwt
n missing distinct Info Mean pMedian Gmd .05
189 0 75 0.999 129.8 125.5 32.12 94.4
.10 .25 .50 .75 .90 .95
99.6 110.0 121.0 140.0 170.0 188.2
lowest : 80 85 89 90 91, highest: 215 229 235 241 250
--------------------------------------------------------------------------------
race
n missing distinct Info Mean pMedian Gmd
189 0 3 0.822 1.847 2 0.9626
Value 1 2 3
Frequency 96 26 67
Proportion 0.508 0.138 0.354
For the frequency table, variable is rounded to the nearest 0
--------------------------------------------------------------------------------
smoke
n missing distinct Info Sum Mean
189 0 2 0.715 74 0.3915
--------------------------------------------------------------------------------
ptl
n missing distinct Info Mean pMedian Gmd
189 0 4 0.403 0.1958 0 0.3409
Value 0 1 2 3
Frequency 159 24 5 1
Proportion 0.841 0.127 0.026 0.005
For the frequency table, variable is rounded to the nearest 0
--------------------------------------------------------------------------------
ht
n missing distinct Info Sum Mean
189 0 2 0.178 12 0.06349
--------------------------------------------------------------------------------
ui
n missing distinct Info Sum Mean
189 0 2 0.379 28 0.1481
--------------------------------------------------------------------------------
ftv
n missing distinct Info Mean pMedian Gmd
189 0 6 0.832 0.7937 0.5 1.041
Value 0 1 2 3 4 6
Frequency 100 47 30 7 4 1
Proportion 0.529 0.249 0.159 0.037 0.021 0.005
For the frequency table, variable is rounded to the nearest 0
--------------------------------------------------------------------------------
bwt
n missing distinct Info Mean pMedian Gmd .05
189 0 131 1 2945 2958 827.3 1801
.10 .25 .50 .75 .90 .95
2038 2414 2977 3487 3865 3997
lowest : 709 1021 1135 1330 1474, highest: 4167 4174 4238 4593 4990
--------------------------------------------------------------------------------
summary(birthwt) low age lwt race
Min. :0.0000 Min. :14.00 Min. : 80.0 Min. :1.000
1st Qu.:0.0000 1st Qu.:19.00 1st Qu.:110.0 1st Qu.:1.000
Median :0.0000 Median :23.00 Median :121.0 Median :1.000
Mean :0.3122 Mean :23.24 Mean :129.8 Mean :1.847
3rd Qu.:1.0000 3rd Qu.:26.00 3rd Qu.:140.0 3rd Qu.:3.000
Max. :1.0000 Max. :45.00 Max. :250.0 Max. :3.000
smoke ptl ht ui
Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.00000 Median :0.0000
Mean :0.3915 Mean :0.1958 Mean :0.06349 Mean :0.1481
3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
Max. :1.0000 Max. :3.0000 Max. :1.00000 Max. :1.0000
ftv bwt
Min. :0.0000 Min. : 709
1st Qu.:0.0000 1st Qu.:2414
Median :0.0000 Median :2977
Mean :0.7937 Mean :2945
3rd Qu.:1.0000 3rd Qu.:3487
Max. :6.0000 Max. :4990
# https://www.rdocumentation.org/packages/MASS/versions/7.3-60.0.1/topics/birthwt
birthwt2 <- birthwt%>%
rename(hypertension = ht)%>%
rename(white = race)%>%
mutate(white = ifelse(white == 1, "yes", "no"))%>%
mutate(lwt = lwt/2.20462)
head(birthwt2)Candidate Models
m0 <- formula(bwt ~ white + smoke + hypertension + lwt + age)
fm0 <- glm(m0, data = birthwt2) # Basismodell
aic_fm0 <- aic(model.fit = fm0, data.set = birthwt2)
m1 <- update(m0,~.+ white:smoke)
fm1 <- glm(m1, data = birthwt2)
#summary(fm1)
aic_fm1 <- aic(model.fit=fm1, data.set = birthwt2) #
m2 <- update(m0,~.- lwt + poly(lwt, 2, raw = TRUE))
fm2 <- glm(m2, data = birthwt2)
#summary(fm2)
aic_fm2 <- aic(model.fit=fm2 , data.set = birthwt2)
m3 <- update(m0,~. + white:smoke + white:hypertension)
fm3 <- glm(m3, data = birthwt2)
#summary(fm3)
aic_fm3 <- aic(model.fit=fm3, data.set = birthwt2)
# AIC- und Deviance-Werte extrahieren
model_comparison <- data.frame(
Modell = c("fm0 (Basis)",
"fm1 (+ white:smoke)",
"fm2 (+poly(lwt, 2, raw = TRUE))",
"fm3 (+ white:hypertension)"),
Deviance = c(aic_fm0$dev,
aic_fm1$dev,
aic_fm2$dev,
aic_fm3$dev),
AIC = c(aic_fm0$aic,
aic_fm1$aic,
aic_fm2$aic,
aic_fm3$aic)
)
# Tabelle anzeigen
knitr::kable(model_comparison, caption = "Vergleich der Modelle anhand von Deviance und AIC")| Modell | Deviance | AIC |
|---|---|---|
| fm0 (Basis) | 2990.739 | 3004.739 |
| fm1 (+ white:smoke) | 2988.655 | 3004.655 |
| fm2 (+poly(lwt, 2, raw = TRUE)) | 2990.729 | 3006.729 |
| fm3 (+ white:hypertension) | 2986.900 | 3004.900 |
Marginal Effects:
fm1 is favored based on the AIC, though the difference compared to the other models is minimal.
## Tabelle 3
# race
print("Marginaler prädiktiver Vergleich der dokumentierten Hautfarbe")[1] "Marginaler prädiktiver Vergleich der dokumentierten Hautfarbe"
fm1_slopes_white <- slopes(fm1,variables = "white")|>
group_by(white) |> summarise(mean = mean(estimate), sd = sd(estimate))
fm1_slopes_white# smoke
print("Marginaler prädiktiver Vergleich des dokumentierten Rauchverhaltens")[1] "Marginaler prädiktiver Vergleich des dokumentierten Rauchverhaltens"
fm1_slopes_smoke <- slopes(fm1,variables = c("smoke"))|>
group_by(smoke) |> summarise(mean = mean(estimate), sd = sd(estimate))
fm1_slopes_smokeP-Hacking:
noise <- list()
anz <- 25
set.seed(280624)#
for (i in 1:anz){
noise[[i]] <- 20 -rtruncnorm(length(birthwt2$bwt), a = 0,b = 20, mean = 10, sd = 2)
}
df.noise<-as.data.frame(noise)
names(df.noise) <- c("V1","V2","V3","V4","V5","V6","V7","V8","V9","V10",
"V11","V12","V13","V14","V15","V16","V17","V18","V19",
"V20","V21","V22","V23","V24","V25")
birthwt2_noise <- cbind(birthwt2,df.noise)
# Kandidatenmodelle
###############################################
candidate_models <- list (length = anz)
for(i in 1:anz){
candidate_models[[i]] <- update(m1,paste0("~. + ",names(df.noise)[i]))
}
# candidate_models[[5]]
summary_models <- list (length = anz)
for(i in 1:anz){
summary_models[[i]] <- summary(glm(candidate_models[[i]], data = birthwt2_noise))
}
#summary_models[[5,9,10,16,25]]
########################################################
final.mod <- update(m1,~. + V5 + V9 + V10 + V16 + V25)
fm_final <- glm(final.mod, data = birthwt2_noise)
#summary(fm_final)Summarize results of the estimation of the regression
tidy(fm_final)