20.4 Wrapping up everything you have learned in this course

The goal is to read data from a file, clean and explore the data, and write to files both the cleaned data and the descriptive statistics (number of observations, mean and SD) of the measured variables for each of the two drugs, separately in women and men.

20.4.1 Read data from the file

# Download the course example files into ./i_o_files.
# NOTE(review): this goes through GitHub's Subversion interface; confirm it is
# still available, otherwise fetch the i_o_files folder another way.
system(command = "svn export https://github.com/biocorecrg/CRG_RIntroduction/trunk/i_o_files")

# Read the tab-separated table through an explicit connection.
# stringsAsFactors = TRUE keeps text columns as factors (this stopped being the
# default in R 4.0); the levels()/droplevels() steps further down need factors.
conn <- file("./i_o_files/example_data.txt", "r")
DATA <- read.table(conn, header = TRUE, sep = "\t", stringsAsFactors = TRUE)
close(conn)  # read.table() does not close a connection that was already open
data <- DATA  # we will save DATA to return to it if needed

# first and last rows
head(data)
tail(data)

# column types and table dimensions
str(data)
dim(data)

20.4.2 Remove empty columns

# one of the ways to remove unwanted columns: drop them by name pattern
# (be careful here because we will need columns "X" and "X.1")
x <- data
x <- x[, grep("X.[2-9]", colnames(x), invert = TRUE), drop = FALSE]
x <- x[, grep("X.1[0-9]", colnames(x), invert = TRUE), drop = FALSE]
colnames(x)

# more elegant (and correct) way that doesn't use grep and doesn't require
# knowledge of columns: flag every column that contains nothing but NA
all_na <- apply(is.na(data), 2, all)
all_na

# drop = FALSE keeps a data frame even if a single column is left
data <- data[, !all_na, drop = FALSE]

colnames(data)

str(data)

20.4.3 Explore, rename and clean variables

# rename the column for sex (the second column of the table)
colnames(data)[2] <- "SEX"

table(data$SEX)
# and remove rows with SEX other than M and F
data <- data[data$SEX %in% c("F", "M"), ]

# How many rows were removed?

# what is the data type of data$SEX?

class(data$SEX)
levels(data$SEX)

# remove unused levels
# as.factor() leaves a factor untouched and lets this step also work when the
# column was read as character, where droplevels() alone fails
data$SEX <- droplevels(as.factor(data$SEX))
str(data$SEX)

Do the same for the column containing information on drugs

# rename column "X.1", which holds the drug, to "DRUG"
colnames(data)
colnames(data)[colnames(data) == "X.1"] <- "DRUG"

data$DRUG
table(data$DRUG)

# keep only the rows for the two drugs of interest
data <- data[data$DRUG %in% c("ART", "PRG"), ]
table(data$DRUG)
levels(data$DRUG)
# remove unused levels; as.factor() leaves a factor untouched and lets this
# step also work when the column was read as character
data$DRUG <- droplevels(as.factor(data$DRUG))
levels(data$DRUG)

20.4.4 Correct non-numeric values, changing them to NA

# you can correct columns one by one
x <- as.numeric(as.character(data$U_12))
str(x)
# etc....

# more elegant way to correct multiple columns at once
df <- data[, !colnames(data) %in% c("SEX", "DRUG"), drop = FALSE]

# Convert one column to numbers; a factor goes through its labels rather than
# its internal integer codes. Anything that is not a number becomes NA (R
# warns "NAs introduced by coercion", which is what we want here).
to_number <- function(column) {
  if (is.factor(column)) {
    column <- as.character(column)
  }
  as.numeric(column)
}

# lapply() converts the columns one by one, whereas apply() would first turn
# the whole data frame into a single matrix
x <- data.frame(lapply(df, to_number))
head(x)
# well, we lost columns SEX and DRUGS, let's add them

df <- cbind(SEX = data$SEX, DRUG = data$DRUG, x)

data <- df # here is our clean dataset
str(data)

20.4.5 Write the corrected data frame to a file

# write the cleaned table as tab-separated text, without row names or quotes
conn <- file("corrected_data.txt", "w")
write.table(data, conn, row.names = FALSE, sep = "\t", quote = FALSE)
close(conn)

# now let's read corrected data
# stringsAsFactors = TRUE brings SEX and DRUG back as factors, as they are
# used with levels() later on
conn <- file("corrected_data.txt", "r")
DATA <- read.table(conn, header = TRUE, sep = "\t", stringsAsFactors = TRUE)
close(conn)

# check that the file round trip did not change the data
all.equal(DATA, data)
data <- DATA  # we will save DATA to return to it if needed

20.4.6 Explore and remove outliers

We are interested in 4 groups (by sex and drug) independently

summary(data)

#install.packages("ggplot2")
library(ggplot2)

# One box plot per measured variable: SEX on the x axis, boxes coloured by DRUG.
# Dropping the first three names (instead of 4:ncol(data)) gives an empty loop
# rather than a backwards sequence when there are fewer than four columns.
for (measure in colnames(data)[-(1:3)]) { # skip columns SEX, DRUG, ID
  print(measure)
  # .data[[measure]] looks the column up by name inside `data`, so the y axis
  # is labelled with the variable name
  p <- ggplot(data = data, aes(x = SEX, y = .data[[measure]], col = DRUG)) +
    geom_boxplot() +
    ggtitle(measure)
  print(p)
}

# Let's remove "obvious" outliers by changing their values to NA.
# which() turns each condition into row positions, so rows where the value is
# already NA are simply left alone.

# A: values below -20 or above 5
data$A <- replace(data$A, which(data$A < -20 | data$A > 5), NA)

# I_6: values at or below -10
data$I_6 <- replace(data$I_6, which(data$I_6 <= -10), NA)

# S_6: values below -4
data$S_6 <- replace(data$S_6, which(data$S_6 < -4), NA)

# I_2: values below -9
data$I_2 <- replace(data$I_2, which(data$I_2 < -9), NA)

# S_2: values above 4
data$S_2 <- replace(data$S_2, which(data$S_2 > 4), NA)

# U_2: values above 10
data$U_2 <- replace(data$U_2, which(data$U_2 > 10), NA)

# D1: values at or below 10
data$D1 <- replace(data$D1, which(data$D1 <= 10), NA)

20.4.7 How to change a value of a specific data point

# rows of the group we want to edit: men treated with ART
in_group <- data$SEX == "M" & data$DRUG == "ART"
data[in_group, ]$E

# difficult way
df <- data  # work on a copy made from the current data
df[in_group, ]$E[df[in_group, ]$E == 911] <- 500 # or NA
df[in_group, ]$E

# simpler way
df <- data  # start again from the unmodified values
df[in_group, ]$E
# index the column directly; which() skips NA values and, unlike assigning
# into a row subset, this does nothing (instead of failing) if no value is 911
df$E[which(in_group & df$E == 911)] <- 500
df[in_group, ]$E

20.4.8 Make a data frame with statistical data

# let's make first a data frame for only one group
sex <- "F"
drug <- "ART"
d <- data[data$SEX == sex & data$DRUG == drug, ]

# every column except the identifying ones is a measured variable
measured <- setdiff(colnames(d), c("SEX", "DRUG", "ID"))
stat_names <- c("N", "mean", "sd")

# N, mean and SD of one column, computed on its non-missing values
describe <- function(x) {
  x <- x[!is.na(x)]
  c(length(x), mean(x), sd(x))
}

# vapply() returns one column of three numbers per variable; t() turns that
# into one row per variable. The statistics stay numeric instead of being
# pasted into a character vector together with the variable name.
stats <- t(vapply(d[measured], describe, numeric(3)))

res <- data.frame(
  variable = measured,
  stats,
  row.names = NULL,
  stringsAsFactors = FALSE
)
colnames(res) <- c("variable", paste(sex, drug, stat_names, sep = "_"))

# make a data frame for statistics for all groups: one row per measured
# variable, and the columns <sex>_<drug>_N, _mean and _sd for every group
stat_names <- c("N", "mean", "sd")
measured <- setdiff(colnames(data), c("SEX", "DRUG", "ID"))

# N, mean and SD of one column, computed on its non-missing values
describe <- function(x) {
  x <- x[!is.na(x)]
  c(length(x), mean(x), sd(x))
}

# collect one numeric matrix (variables x statistics) per group and bind them
# once at the end, instead of growing a data frame inside the loop
# as.factor() makes levels() work for character columns as well
group_stats <- list()
for (sex in levels(as.factor(data$SEX))) {
  for (drug in levels(as.factor(data$DRUG))) {
    d <- data[data$SEX == sex & data$DRUG == drug, measured, drop = FALSE]
    stats <- t(vapply(d, describe, numeric(3)))
    colnames(stats) <- paste(sex, drug, stat_names, sep = "_")
    group_stats[[paste(sex, drug, sep = "_")]] <- stats
  }
}

# variables become the row names, so no separate "variable" column is needed
df <- data.frame(do.call(cbind, group_stats), row.names = measured)

20.4.9 Format a data frame

# Let's round all values to two decimal digits
rounded <- lapply(df, function(column) round(as.numeric(column), digits = 2))
rounded
# lapply() returns a plain list without row names, so hand the original ones
# over while rebuilding the data frame
df <- data.frame(rounded, row.names = rownames(df))

20.4.10 Write a data frame to a file

# Save the statistics twice: tab-separated and comma-separated.
# col.names = NA writes an empty header cell above the row names.
separators <- c("Results.txt" = "\t", "Results.csv" = ",")
for (path in names(separators)) {
  write.table(
    df,
    file = path,
    sep = separators[[path]],
    row.names = TRUE,
    col.names = NA
  )
}

20.4.11 Write a table using a comma instead of a dot as the decimal separator

# dec = "," makes write.table() print numeric columns with a decimal comma;
# the values are written as numbers, without padding to a common width
conn <- file("Results_comma.txt", "w")
write.table(
  df,
  conn,
  row.names = TRUE,
  col.names = NA,
  sep = "\t",
  dec = ",",
  quote = FALSE
)
close(conn)