library(tidyverse)
library(corrr)
library(fastDummies)
library(corrplot)
library(Metrics)
14 Probeklausur Lösungsansatz
14.1 Pakete und Datenladen
library(readr)
<- read_csv("d_train.csv") d_train
Rows: 205 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): species, island, sex
dbl (6): ID, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- read_csv("d_test.csv") d_test
Rows: 134 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): species, island, sex
dbl (5): ID, bill_length_mm, bill_depth_mm, flipper_length_mm, year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(d_train)
14.2 Datenjudo
%>%
d_train summarise((across(everything(),~sum(is.na(.x)))))
%>%
d_train select(body_mass_g) %>%
summarise(preis_avg=mean(body_mass_g),
preis_md = median(body_mass_g),
preis_sd = sd(body_mass_g),
preis_iqr = IQR(body_mass_g))
ggplot(d_train, aes(x=body_mass_g))+
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(d_train, aes(x=body_mass_g))+
geom_density()
ggplot(data = d_train)+
geom_point(mapping = aes(x = body_mass_g, y = bill_length_mm))+
geom_smooth(mapping = aes(x = body_mass_g, y = bill_length_mm))
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
ggplot(data = d_train)+
geom_point(mapping = aes(x = body_mass_g, y = bill_depth_mm))+
geom_smooth(mapping = aes(x = body_mass_g, y = bill_depth_mm))
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
ggplot(data = d_train)+
geom_point(mapping = aes(x = body_mass_g, y = flipper_length_mm))+
geom_smooth(mapping = aes(x = body_mass_g, y = flipper_length_mm))
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
%>%
d_train mutate(across(where(is.integer), as.numeric)) %>%
select(where(is.numeric)) %>%
pivot_longer(everything(), names_to = "variable") %>%
ggplot(aes(x = value)) +
geom_histogram() +
facet_wrap(~ variable, scales = "free")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 3 rows containing non-finite outside the scale range
(`stat_bin()`).
%>%
d_train sapply(class)
ID species island bill_length_mm
"numeric" "character" "character" "numeric"
bill_depth_mm flipper_length_mm body_mass_g sex
"numeric" "numeric" "numeric" "character"
year
"numeric"
::vis_dat(d_train) visdat
%>%
d_train select(where(is.numeric)) %>%
summarise(across(
.cols = everything(),
.fns = ~cor(.,d_train$body_mass_g, use = "complete.obs")
%>%
)) pivot_longer(everything()) %>%
arrange(-abs(value))
%>%
d_train summarise(q75=quantile(body_mass_g,
prob = .75,
na.rm = TRUE),
q25=quantile(body_mass_g,
prob = .25,
na.rm = TRUE))
%>%
d_train summarise(body_mass_g_iqr = IQR(body_mass_g,na.rm = TRUE))
<- c(4750 + 1.5 * 1200, 3550 - 1.5 * 1200) body_mass_g_grenzwerte
%>%
d_train summarise(q75=quantile(bill_length_mm,
prob = .75,
na.rm = TRUE),
q25=quantile(bill_length_mm,
prob = .25,
na.rm = TRUE))
%>%
d_train summarise(bill_length_mm_iqr = IQR(bill_length_mm,na.rm = TRUE))
<- c(48.775 + 1.5 * 9.1, 39.675 - 1.5 * 9.1) bill_length_mm_grenzwerte
%>%
d_train summarise(q75=quantile(bill_depth_mm,
prob = .75,
na.rm = TRUE),
q25=quantile(bill_depth_mm,
prob = .25,
na.rm = TRUE))
%>%
d_train summarise(bill_depth_mm_iqr = IQR(bill_depth_mm,na.rm = TRUE))
<- c(18.8 + 1.5 * 3.5, 15.3 - 1.5 * 3.5) bill_depth_mm_grenzwerte
%>%
d_train summarise(q75=quantile(flipper_length_mm,
prob = .75,
na.rm = TRUE),
q25=quantile(flipper_length_mm,
prob = .25,
na.rm = TRUE))
%>%
d_train summarise(flipper_length_mm_iqr = IQR(flipper_length_mm,na.rm = TRUE))
<- c(214 + 1.5 * 24, 190 - 1.5 * 24) flipper_length_mm_grenzwerte
%>%
d_train count(bill_depth_mm > 24.1)
%>%
d_train count(bill_depth_mm < 10.1)
%>%
d_train count(bill_length_mm > 62.4)
%>%
d_train count(bill_length_mm < 26)
%>%
d_train count(body_mass_g > 6550)
%>%
d_train count(body_mass_g < 1750)
%>%
d_train count(flipper_length_mm > 250)
%>%
d_train count(flipper_length_mm < 154)
%>%
d_train summarise((across(everything(),~sum(is.na(.x)))))
<- d_train %>%
d_train_ohneNA replace_na(replace = list(bill_length_mm = mean(.$bill_length_mm, na.rm = TRUE))) %>%
replace_na(replace = list(bill_depth_mm = mean(.$bill_depth_mm, na.rm = TRUE)))
<- d_train_ohneNA %>%
d_train_ohneNA replace_na(list(flipper_length_mm = 0))
%>%
d_train_ohneNA summarise((across(everything(),~sum(is.na(.x)))))
%>%
d_train ggplot() +
aes(x = body_mass_g, y = bill_length_mm) +
geom_boxplot()
Warning: Continuous x aesthetic
ℹ did you forget `aes(group = ...)`?
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
%>%
d_train ggplot() +
aes(x = body_mass_g, y = bill_depth_mm) +
geom_boxplot()
Warning: Continuous x aesthetic
ℹ did you forget `aes(group = ...)`?
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
%>%
d_train ggplot() +
aes(x = body_mass_g, y = flipper_length_mm) +
geom_boxplot()
Warning: Continuous x aesthetic
ℹ did you forget `aes(group = ...)`?
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
%>%
d_train ggplot() +
aes(x = body_mass_g, y = bill_length_mm) +
geom_point()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
%>%
d_train ggplot() +
aes(x = body_mass_g, y = bill_depth_mm) +
geom_point()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
%>%
d_train ggplot() +
aes(x = body_mass_g, y = flipper_length_mm) +
geom_point()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
<- subset(d_train_ohneNA, select = c(body_mass_g,bill_depth_mm, flipper_length_mm, bill_length_mm )) subset_cor
<- cor(subset_cor)
Korr_tab
Korr_tab
body_mass_g bill_depth_mm flipper_length_mm bill_length_mm
body_mass_g 1.0000000 -0.4756584 0.6163949 0.5809113
bill_depth_mm -0.4756584 1.0000000 -0.3880401 -0.2149026
flipper_length_mm 0.6163949 -0.3880401 1.0000000 0.4525609
bill_length_mm 0.5809113 -0.2149026 0.4525609 1.0000000
%>%
d_train_ohneNA mutate(species = factor(species)) %>%
ggplot(aes(x= body_mass_g, y= bill_depth_mm, color = species))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
%>%
d_train_ohneNA mutate(species = factor(species)) %>%
ggplot(aes(x= body_mass_g, y= flipper_length_mm, color = species))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
%>%
d_train_ohneNA mutate(species = factor(species)) %>%
ggplot(aes(x= body_mass_g, y= bill_length_mm, color = species))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
%>%
d_train_ohneNA mutate(island = factor(island)) %>%
ggplot(aes(x= body_mass_g, y= bill_depth_mm, color = island))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
%>%
d_train_ohneNA mutate(island = factor(island)) %>%
ggplot(aes(x= body_mass_g, y= flipper_length_mm, color = island))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
%>%
d_train_ohneNA mutate(species = factor(island)) %>%
ggplot(aes(x= body_mass_g, y= bill_length_mm, color = island))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
%>%
d_train_ohneNA mutate(sex = factor(sex)) %>%
ggplot(aes(x= body_mass_g, y= bill_depth_mm, color = sex))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
%>%
d_train_ohneNA mutate(sex = factor(sex)) %>%
ggplot(aes(x= body_mass_g, y= flipper_length_mm, color = sex))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
%>%
d_train_ohneNA mutate(sex = factor(sex)) %>%
ggplot(aes(x= body_mass_g, y= bill_length_mm, color = sex))+
geom_point()+
geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'
14.3 Modellierung
<- lm(body_mass_g~., data = d_train_ohneNA)
lm1_all_in summary(lm1_all_in)
Call:
lm(formula = body_mass_g ~ ., data = d_train_ohneNA)
Residuals:
Min 1Q Median 3Q Max
-497.76 -135.02 -6.11 134.88 666.28
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 215677.757 37545.724 5.744 3.66e-08 ***
ID 7.420 0.540 13.740 < 2e-16 ***
speciesChinstrap -442.933 82.669 -5.358 2.44e-07 ***
speciesGentoo 99.016 130.592 0.758 0.44927
islandDream -93.871 55.434 -1.693 0.09204 .
islandTorgersen -66.964 56.482 -1.186 0.23728
bill_length_mm 17.172 6.678 2.571 0.01090 *
bill_depth_mm 26.326 17.634 1.493 0.13714
flipper_length_mm 12.755 2.767 4.610 7.41e-06 ***
sexmale 148.302 46.653 3.179 0.00173 **
year -107.561 18.734 -5.741 3.71e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 196.5 on 188 degrees of freedom
(6 Beobachtungen als fehlend gelöscht)
Multiple R-squared: 0.9418, Adjusted R-squared: 0.9388
F-statistic: 304.5 on 10 and 188 DF, p-value: < 2.2e-16
sqrt(mean(lm1_all_in$residuals^2))
[1] 190.9789
<- lm(body_mass_g~species + flipper_length_mm + sex + year, data = d_train_ohneNA )
lm2 summary(lm2)
Call:
lm(formula = body_mass_g ~ species + flipper_length_mm + sex +
year, data = d_train_ohneNA)
Residuals:
Min 1Q Median 3Q Max
-702.3 -192.9 -11.0 204.2 816.0
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 106966.975 52276.996 2.046 0.0421 *
speciesChinstrap -146.330 56.977 -2.568 0.0110 *
speciesGentoo 710.509 107.102 6.634 3.21e-10 ***
flipper_length_mm 24.205 3.699 6.544 5.26e-10 ***
sexmale 464.397 49.007 9.476 < 2e-16 ***
year -53.837 26.126 -2.061 0.0407 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 286.2 on 193 degrees of freedom
(6 Beobachtungen als fehlend gelöscht)
Multiple R-squared: 0.8733, Adjusted R-squared: 0.8701
F-statistic: 266.2 on 5 and 193 DF, p-value: < 2.2e-16
sqrt(mean(lm2$residuals^2))
[1] 281.842
<- lm(body_mass_g~ flipper_length_mm:species + island:bill_depth_mm + island:bill_length_mm + bill_length_mm:sex +species + flipper_length_mm + sex + year, data = d_train_ohneNA)
lm3 summary(lm3)
Call:
lm(formula = body_mass_g ~ flipper_length_mm:species + island:bill_depth_mm +
island:bill_length_mm + bill_length_mm:sex + species + flipper_length_mm +
sex + year, data = d_train_ohneNA)
Residuals:
Min 1Q Median 3Q Max
-743.12 -165.80 2.24 175.77 780.22
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 102598.089 52605.576 1.950 0.052656 .
speciesChinstrap 108.852 1545.407 0.070 0.943923
speciesGentoo -469.020 1577.469 -0.297 0.766554
flipper_length_mm 19.432 5.302 3.665 0.000323 ***
sexmale 872.246 402.545 2.167 0.031533 *
year -52.273 26.277 -1.989 0.048154 *
flipper_length_mm:speciesChinstrap -1.489 7.993 -0.186 0.852426
flipper_length_mm:speciesGentoo 4.933 7.656 0.644 0.520220
islandBiscoe:bill_depth_mm 39.366 32.310 1.218 0.224628
islandDream:bill_depth_mm 85.184 34.480 2.471 0.014400 *
islandTorgersen:bill_depth_mm 45.005 34.797 1.293 0.197508
islandBiscoe:bill_length_mm 39.792 12.993 3.062 0.002524 **
islandDream:bill_length_mm 15.422 16.737 0.921 0.358039
islandTorgersen:bill_length_mm 33.962 16.578 2.049 0.041922 *
bill_length_mm:sexmale -12.705 9.080 -1.399 0.163459
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 277 on 184 degrees of freedom
(6 Beobachtungen als fehlend gelöscht)
Multiple R-squared: 0.8869, Adjusted R-squared: 0.8782
F-statistic: 103 on 14 and 184 DF, p-value: < 2.2e-16
sqrt(mean(lm3$residuals^2))
[1] 266.3926
<- lm(body_mass_g ~ flipper_length_mm + sex + island:bill_length_mm + year, data = d_train_ohneNA )
lm4 summary(lm4)
Call:
lm(formula = body_mass_g ~ flipper_length_mm + sex + island:bill_length_mm +
year, data = d_train_ohneNA)
Residuals:
Min 1Q Median 3Q Max
-768.8 -184.7 2.7 192.6 1079.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 165831.813 53955.688 3.073 0.00242 **
flipper_length_mm 35.375 2.897 12.209 < 2e-16 ***
sexmale 373.106 48.119 7.754 5.09e-13 ***
year -84.349 26.900 -3.136 0.00198 **
islandBiscoe:bill_length_mm 14.589 6.484 2.250 0.02559 *
islandDream:bill_length_mm 3.646 5.891 0.619 0.53677
islandTorgersen:bill_length_mm 6.881 6.874 1.001 0.31803
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 302.5 on 192 degrees of freedom
(6 Beobachtungen als fehlend gelöscht)
Multiple R-squared: 0.8592, Adjusted R-squared: 0.8548
F-statistic: 195.3 on 6 and 192 DF, p-value: < 2.2e-16
sqrt(mean(lm4$residuals^2))
[1] 297.118
<- lm(body_mass_g ~ .+flipper_length_mm:species + island:bill_depth_mm + island:bill_length_mm + bill_length_mm:sex, data = d_train_ohneNA )
lm5 summary(lm5)
Call:
lm(formula = body_mass_g ~ . + flipper_length_mm:species + island:bill_depth_mm +
island:bill_length_mm + bill_length_mm:sex, data = d_train_ohneNA)
Residuals:
Min 1Q Median 3Q Max
-548.03 -110.28 9.74 105.87 655.87
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.244e+05 3.583e+04 6.263 2.67e-09 ***
ID 7.537e+00 5.164e-01 14.597 < 2e-16 ***
speciesChinstrap 2.685e+02 1.069e+03 0.251 0.802016
speciesGentoo 4.427e+02 1.174e+03 0.377 0.706417
islandDream 2.271e+03 6.069e+02 3.743 0.000244 ***
islandTorgersen 2.602e+03 7.904e+02 3.292 0.001194 **
bill_length_mm 3.488e+01 9.044e+00 3.857 0.000159 ***
bill_depth_mm 6.687e+01 2.612e+01 2.559 0.011300 *
flipper_length_mm 1.266e+01 3.591e+00 3.527 0.000532 ***
sexmale 2.192e+02 2.836e+02 0.773 0.440441
year -1.126e+02 1.791e+01 -6.287 2.35e-09 ***
speciesChinstrap:flipper_length_mm -2.326e+00 5.559e+00 -0.418 0.676155
speciesGentoo:flipper_length_mm -1.780e+00 5.584e+00 -0.319 0.750250
islandDream:bill_depth_mm -4.055e+01 3.473e+01 -1.167 0.244550
islandTorgersen:bill_depth_mm -7.696e+01 3.760e+01 -2.047 0.042098 *
islandDream:bill_length_mm -4.246e+01 1.302e+01 -3.262 0.001322 **
islandTorgersen:bill_length_mm -3.271e+01 1.426e+01 -2.294 0.022961 *
bill_length_mm:sexmale -1.485e+00 6.349e+00 -0.234 0.815364
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 183.4 on 181 degrees of freedom
(6 Beobachtungen als fehlend gelöscht)
Multiple R-squared: 0.9512, Adjusted R-squared: 0.9466
F-statistic: 207.5 on 17 and 181 DF, p-value: < 2.2e-16
sqrt(mean(lm5$residuals^2))
[1] 174.9401
<- lm(body_mass_g ~ species + island + bill_length_mm + bill_depth_mm + flipper_length_mm + sex + year + species:flipper_length_mm + island:bill_depth_mm + island:bill_length_mm + bill_length_mm:sex, data = d_train_ohneNA)
lm6 summary(lm6)
Call:
lm(formula = body_mass_g ~ species + island + bill_length_mm +
bill_depth_mm + flipper_length_mm + sex + year + species:flipper_length_mm +
island:bill_depth_mm + island:bill_length_mm + bill_length_mm:sex,
data = d_train_ohneNA)
Residuals:
Min 1Q Median 3Q Max
-693.23 -164.70 4.26 168.33 789.05
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 129567.602 51852.791 2.499 0.01335 *
speciesChinstrap -698.016 1570.314 -0.445 0.65720
speciesGentoo 2115.001 1718.533 1.231 0.22002
islandDream 2633.265 892.266 2.951 0.00358 **
islandTorgersen 3324.224 1160.785 2.864 0.00468 **
bill_length_mm 52.471 13.190 3.978 0.00010 ***
bill_depth_mm 111.849 38.173 2.930 0.00382 **
flipper_length_mm 21.082 5.215 4.043 7.79e-05 ***
sexmale 1038.174 409.010 2.538 0.01198 *
year -66.761 25.948 -2.573 0.01088 *
speciesChinstrap:flipper_length_mm 2.959 8.162 0.363 0.71739
speciesGentoo:flipper_length_mm -6.595 8.203 -0.804 0.42249
islandDream:bill_depth_mm -58.793 51.077 -1.151 0.25121
islandTorgersen:bill_depth_mm -117.569 55.170 -2.131 0.03443 *
islandDream:bill_length_mm -43.172 19.154 -2.254 0.02539 *
islandTorgersen:bill_length_mm -33.477 20.985 -1.595 0.11238
bill_length_mm:sexmale -16.387 9.221 -1.777 0.07721 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 269.9 on 182 degrees of freedom
(6 Beobachtungen als fehlend gelöscht)
Multiple R-squared: 0.8938, Adjusted R-squared: 0.8844
F-statistic: 95.7 on 16 and 182 DF, p-value: < 2.2e-16
sqrt(mean(lm6$residuals^2))
[1] 258.1289
14.4 Nützliche Links
Linear Regression Summary in R