20.2 Descriptive Statistics
20.2.1 Functions: mean, sd, var, min, max, median, range, IQR, quantile.
All these function require special treatment for missing values, using parameter na.rm = TRUE.
min(data$repwt)
?min
min(data$repwt, na.rm = TRUE)
max(data$repwt, na.rm = TRUE)
range(data$repwt, na.rm = TRUE)
range(data$repwt, na.rm = TRUE)[1] == min(data$repwt, na.rm = TRUE)
range(data$repwt, na.rm = TRUE)[2] == max(data$repwt, na.rm = TRUE)
mean(data$repwt, na.rm = TRUE)
median(data$repwt, na.rm = TRUE)
quantile(data$repwt, na.rm = TRUE) # shows quantiles for values in a specific column, ignoring 'NA'
quantile(data$repwt, na.rm = TRUE, probs = c(0, 0.25, 0.5, 0.75, 1)) # standard quantiles (or quartiles)
quantile(data$repwt, na.rm = TRUE, probs = c(0, 0.1, 0.9, 1)) # any other quantiles
IQR(data$repwt, na.rm = TRUE)
IQR(data$repwt, na.rm = TRUE) == quantile(data$repwt, na.rm = TRUE)[4] - quantile(data$repwt, na.rm = TRUE)[2]
# contigency table of quantiles of weight versus quantiles of repwt
x <- data$weight
any(is.na(x))
y <- data$repwt
any(is.na(y))
table(cut(x, quantile(x)), cut(y, quantile(y, na.rm = TRUE)))
20.2.2 Functions tapply() and round()
var(data$repwt, na.rm = TRUE)
?var
sd(data$repwt, na.rm = TRUE)
sd(data$repwt, na.rm = TRUE) == sqrt(var(data$repwt, na.rm = TRUE))
sd(data$repwt, na.rm = TRUE) ** 2 == var(data$repwt, na.rm = TRUE)
# function tapply()
?tapply
tapply(data$weight, data$sex, mean)
tapply(data$weight, data$sex, quantile)
tapply(data$weight, data$sex, quantile)$F
20.2.3 Excercises
- How many women have weight below (or equal) median AND height above median?
d <- data[data$sex == "F", ]
# strightforward solution
nrow(d[d$weight <= median(d$weight) & d$height > median(d$height), ])
# solution using table, cut and quantile
x <- d$weight
y <- d$height
table(cut(x, quantile(x)), cut(y, quantile(y, na.rm = TRUE)))
z <- table(cut(x, quantile(x, probs = c(0, 0.5, 1))), cut(y, quantile(y, probs = c(0, 0.5, 1))))
z[3]
How many women have weight in the lower 5% quantile AND height in the upper 5% quantile?
What is the difference of the mean and sd values of the reported height between men and women (round reported values to one digit)?
20.2.4 Data visualization: boxplot()
boxplot(data$weight)
y <- median(data$weight)
segments(0, y, 1, y, lwd = 3, col = "red")
y <- mean(data$weight)
segments(0, y, 1, y, lwd = 3, col = "blue")
y <- mean(data$weight) - sd(data$weight)
segments(0, y, 1, y, lwd = 3, col = "green")
y <- mean(data$weight) + sd(data$weight)
segments(0, y, 1, y, lwd = 3, col = "green")
y <- quantile(data$weight)[1]
segments(0, y, 1, y, lwd = 3, col = "orange")
y <- quantile(data$weight)[2]
segments(0, y, 1, y, lwd = 3, col = "magenta")
y <- quantile(data$weight)[4]
segments(0, y, 1, y, lwd = 3, col = "cyan")
y <- quantile(data$weight)[5]
segments(0, y, 1, y, lwd = 3, col = "orange")
20.2.5 Outliers
Can be defined (by the Tuley’s test) as values in the sample that differ from the Q1 (25% quantile) or Q3 (75% quantile) by more than 1.5 x IQR (where IQR = Q3 - Q1). However, there is no formal definition of outliers; they need to be treated subjectively.
quantile(data$weight)
lower_limit <- quantile(data$weight)[2] - 1.5 * IQR(data$weight) # values below this limit are outliers
lower_limit
min(data$weight) < lower_limit # Is the minimum weight an outlier?
lower_whisker <- max(min(data$weight), lower_limit)
lower_whisker
boxplot(data$weight)
y <- lower_whisker
segments(0, y, 1, y, lwd = 3, col = "black")
upper_limit <- quantile(data$weight)[4] + 1.5 * IQR(data$weight) # values above this limit are outliers
upper_limit
max(data$weight) > upper_limit # Is the maximum weight an outlier? TRUE or FALSE?
upper_whisker = min(max(data$weight), upper_limit)
upper_whisker
y <- upper_whisker
segments(0, y, 1, y, lwd = 3, col = "black")
20.2.6 How statistics change if to remove outliers
# We can remove all "formal" outliers
d <- data[data$weight <= upper_whisker, ]
# While it is better to remove only those that have no sense
d <- data[data$weight != max(data$weight), ]
# Did mean change?
mean(d$weight)
mean(data$weight)
# Did median change?
median(data$weight)
median(d$weight)
# Did SD change?
sd(data$weight)
sd(d$weight)
boxplot(d$weight)