20.4 Wrapping up everything you have learned in this course
The goal is to read data from a file, clean and explore the data, and write to files both the cleaned data and the descriptive statistics (number of observations, mean and SD) of the measured variables affected by two drugs, separately for women and men.
20.4.1 Read data from the file
# Fetch the course input files from GitHub (needs the `svn` command-line client).
# NOTE(review): GitHub is reported to have ended Subversion access in January 2024,
# so this export may fail — confirm, and download the i_o_files folder manually if so.
system(command="svn export https://github.com/biocorecrg/CRG_RIntroduction/trunk/i_o_files")
# Read the tab-separated table. The connection is opened explicitly, so it has
# to be closed explicitly as well once the data are in memory.
# stringsAsFactors = TRUE keeps text columns as factors (the default became
# FALSE in R 4.0.0); the levels()/droplevels() steps further down need factors.
conn <- file("./i_o_files/example_data.txt", "r")
DATA <- read.table(conn, header = TRUE, sep = "\t", stringsAsFactors = TRUE)
close(conn)
data <- DATA # DATA keeps the untouched original so we can return to it if needed
# First look at the table: top and bottom rows, column types, dimensions
head(data)
tail(data)
str(data)
dim(data)
20.4.2 Remove empty columns
# One way to remove unwanted columns: drop them by name pattern.
# Be careful here: the patterns must not match columns "X" and "X.1",
# which are needed later.
# drop = FALSE keeps the result a data frame even if a single column is left.
x <- data
x <- x[, grep("X.[2-9]", colnames(x), invert = TRUE), drop = FALSE]
x <- x[, grep("X.1[0-9]", colnames(x), invert = TRUE), drop = FALSE]
colnames(x)
# A more elegant (and correct) way that doesn't use grep and doesn't require
# knowledge of the column names: flag the columns in which every value is NA
empty_col <- vapply(data, function(col) all(is.na(col)), logical(1))
empty_col
data <- data[, !empty_col, drop = FALSE]
colnames(data)
str(data)
20.4.3 Explore, rename and clean variables
# Rename the column for sex (the second column of the table)
colnames(data)[2] <- "SEX"
table(data$SEX)
# and remove rows with SEX other than M and F
n_before <- nrow(data)
data <- data[data$SEX %in% c("F", "M"), ]
# How many rows were removed?
n_before - nrow(data)
# what is the data type of data$SEX?
class(data$SEX)
levels(data$SEX)
# Remove unused levels. factor() rebuilds the levels from the values that are
# still present and, unlike droplevels(), also works when the column was read
# as character (the read.table default since R 4.0.0).
data$SEX <- factor(data$SEX)
str(data$SEX)
Do the same for the column containing information on drugs
# Find the column holding the drug and give it a meaningful name
colnames(data)
colnames(data)[colnames(data) == "X.1"] <- "DRUG"
data$DRUG
table(data$DRUG)
# Keep only the rows for the two drugs of interest
data <- data[data$DRUG %in% c("ART", "PRG"), ]
table(data$DRUG)
levels(data$DRUG)
# Remove unused levels. factor() rebuilds the levels from the values that are
# still present and, unlike droplevels(), also works when the column was read
# as character (the read.table default since R 4.0.0).
data$DRUG <- factor(data$DRUG)
levels(data$DRUG)
20.4.4 Correct non-numeric values, changing them to NA
# You can correct columns one by one. as.character() comes first so that a
# factor is converted through its labels rather than its integer codes;
# anything that is not a number becomes NA (with a warning).
x <- as.numeric(as.character(data$U_12))
str(x)
# etc....
# More elegant way: correct all the measurement columns at once.
# lapply() converts each column on its own. apply() would first turn the whole
# data frame into a character matrix, and in a mixed data frame that passes
# numeric columns through format(), which keeps only the default number of
# significant digits.
df <- data[, !colnames(data) %in% c("SEX", "DRUG"), drop = FALSE]
x <- df
x[] <- lapply(df, function(col) as.numeric(as.character(col)))
str(x)
# SEX and DRUG were left out of the conversion; put them back in front
df <- cbind(SEX = data$SEX, DRUG = data$DRUG, x)
data <- df # here is our clean dataset
str(data)
20.4.5 Write corrected data frame in the file
# Write the cleaned table as tab-separated text, without row names or quotes
conn <- file("corrected_data.txt", "w")
write.table(data, conn, row.names = FALSE, sep = "\t", quote = FALSE)
close(conn)
# Now let's read the corrected data back.
# stringsAsFactors = TRUE restores SEX and DRUG as factors (the default became
# FALSE in R 4.0.0); the group loops further down iterate over their levels().
conn <- file("corrected_data.txt", "r")
DATA <- read.table(conn, header = TRUE, sep = "\t", stringsAsFactors = TRUE)
close(conn)
# Compare the table read from disk with the one in memory
all.equal(DATA, data)
data <- DATA # continue with the re-read table; DATA stays as a copy to return to if needed
20.4.6 Explore and remove outliers
We are interested in 4 groups (by sex and drug) independently
summary(data)
#install.packages("ggplot2")
library(ggplot2)
# One boxplot per measured variable, split by SEX on the x axis and coloured
# by DRUG. The first three columns (SEX, DRUG, ID) are skipped by negative
# indexing, which yields nothing to loop over when the table has three columns
# or fewer (4:ncol(data) would count backwards in that case).
for (col_name in colnames(data)[-(1:3)]) {
  print(col_name)
  # .data[[col_name]] looks the column up inside the data frame given to
  # ggplot() instead of capturing a vector from outside the plot
  p <- ggplot(data = data, aes(x = SEX, y = .data[[col_name]], col = DRUG)) +
    geom_boxplot() +
    labs(y = col_name) +
    ggtitle(col_name)
  print(p)
}
# Remove the "obvious" outliers by setting them to NA.
# which() turns each comparison into row positions and skips values that are
# already NA.
# A: data points below -20 or above 5
data$A[which(data$A < -20 | data$A > 5)] <- NA
# I_6: data points of -10 or less
data$I_6[which(data$I_6 <= -10)] <- NA
# S_6: data points below -4
data$S_6[which(data$S_6 < -4)] <- NA
# I_2: data points below -9
data$I_2[which(data$I_2 < -9)] <- NA
# S_2: data points above 4
data$S_2[which(data$S_2 > 4)] <- NA
# U_2: data points above 10
data$U_2[which(data$U_2 > 10)] <- NA
# D1: data points of 10 or less
data$D1[which(data$D1 <= 10)] <- NA
20.4.7 How to change a value of a specific data point
# Values of E for men treated with ART
data[data$SEX == "M" & data$DRUG == "ART", ]$E
# Work on a copy so that `data` itself is left untouched
df <- data
# difficult way: subset the rows, take E, then index the value inside that subset
df[df$SEX == "M" & df$DRUG == "ART", ]$E[df[df$SEX == "M" & df$DRUG == "ART", ]$E == 911] <- 500 # or NA
df[df$SEX == "M" & df$DRUG == "ART", ]$E
# simpler way: start again from a fresh copy and build one row selector
df <- data
df[df$SEX == "M" & df$DRUG == "ART", ]$E
# which() drops rows where E is NA and returns an empty selection when nothing
# matches, so the assignment is then a no-op instead of an error
target <- which(df$SEX == "M" & df$DRUG == "ART" & df$E == 911)
df$E[target] <- 500
df[df$SEX == "M" & df$DRUG == "ART", ]$E
20.4.8 Make a data frame with statistical data
# Let's first make a data frame of statistics for only one group: women on ART
sex <- "F"
drug <- "ART"
d <- data[data$SEX == sex & data$DRUG == drug, ]
stat_names <- c("N", "mean", "sd")
# Measured variables = every column except the grouping columns and the ID
measured <- setdiff(colnames(d), c("SEX", "DRUG", "ID"))
# For each measured variable: number of non-missing observations, mean and SD.
# vapply() returns a numeric matrix (one column per variable), so the
# statistics stay numbers instead of being pasted together with the variable
# name into character strings.
stats <- vapply(
  d[measured],
  function(x) {
    x <- x[!is.na(x)]
    c(length(x), mean(x), sd(x))
  },
  numeric(3)
)
# One row per variable; row.names = NULL keeps plain 1..n row names
res <- data.frame(
  variable = measured,
  t(stats),
  row.names = NULL,
  stringsAsFactors = FALSE
)
colnames(res) <- c("variable", paste(sex, drug, stat_names, sep = "_"))
# Make a data frame of statistics (N, mean, SD) for all sex x drug groups:
# one row per measured variable, three columns per group.
stat_names <- c("N", "mean", "sd")
# Measured variables = every column except the grouping columns and the ID
measured <- setdiff(colnames(data), c("SEX", "DRUG", "ID"))
# factor() makes the group labels available whether the columns are stored as
# factors or as character vectors (levels() of a character vector is NULL)
sexes <- levels(factor(data$SEX))
drugs <- levels(factor(data$DRUG))
group_stats <- vector("list", length(sexes) * length(drugs))
count <- 1
for (sex in sexes) {
  for (drug in drugs) {
    d <- data[data$SEX == sex & data$DRUG == drug, ]
    # numeric matrix: rows N/mean/sd, one column per measured variable
    stats <- vapply(
      d[measured],
      function(x) {
        x <- x[!is.na(x)]
        c(length(x), mean(x), sd(x))
      },
      numeric(3)
    )
    res <- as.data.frame(t(stats))
    colnames(res) <- paste(sex, drug, stat_names, sep = "_")
    group_stats[[count]] <- res
    count <- count + 1
  }
}
stopifnot(length(group_stats) > 0)
# Bind the per-group tables side by side in one step and label the rows with
# the variable names
df <- do.call(cbind, group_stats)
rownames(df) <- measured
20.4.9 Format a data frame
# Round every value to two decimal digits, column by column
rounded <- lapply(df, function(col) round(as.numeric(col), digits = 2))
rounded
# lapply() returns a plain list, which has no row names: rebuild the data
# frame and restore them from df
x <- data.frame(rounded, row.names = rownames(df))
df <- x
20.4.10 Write a data frame in the file
# Save the statistics table twice: tab-separated and comma-separated.
# col.names = NA writes an empty header cell above the row names, so the
# header lines up with the data columns.
write.table(df, file = "Results.txt", sep = "\t", row.names = TRUE, col.names = NA)
write.table(df, file = "Results.csv", sep = ",", row.names = TRUE, col.names = NA)
20.4.11 Write a table using a comma instead of a dot as the decimal separator
# format() converts the values to text, using a comma as the decimal mark
df_comma <- format(df, decimal.mark = ",")
# Tab-separated output without quotes; col.names = NA leaves an empty header
# cell above the row names
write.table(
  df_comma,
  file = "Results_comma.txt",
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = NA
)