Descriptive statistics – Factor variable

This post presents a function designed to give the user a quick overview of a factor variable, including missing data, the levels of the factor, and the frequency with which each level appears in the data, all with one function call.

The enclosed code is an updated version of the original. It includes the "unique=" argument to allow the user to specify whether they want each level of the factor to be printed. This code also includes modifications to make looping through a list of variables easier. The code at the end of the post shows how one might quickly apply the function to a group of factor variables.

# Simulate an illustrative data set: three categorical columns of 10,000
# observations each, every observation drawn from the letters A-E with equal
# probability. The draws are made column by column, in order.
df <- as.data.frame(
  replicate(
    3,
    sample(LETTERS[1:5], 10000, replace = TRUE, prob = rep(0.2, 5))
  )
)
names(df) <- paste0("variable", 1:3)


#######################
# The function itself #
#######################

# Print a quick univariate summary of one factor-like column.
#
# Arguments:
#   data   - a data frame (or anything as.data.frame() can convert) that
#            contains the column to summarise.
#   var    - the name of the column, as a single character string.
#   unique - if TRUE (the default), also print the sorted unique values of the
#            column, which is useful for spotting data entry errors.
#
# Output (printed to the console):
#   - the variable name,
#   - the number and percentage of missing observations,
#   - a frequency table (count and proportion per level, plus a "Total" row;
#     missing values are excluded from the table),
#   - optionally, the unique values.
#
# Returns the frequency table (a data frame with columns <var>, Count and
# Proportion) invisibly, so the result can be captured without extra printing.
uni.factor <- function(data, var, unique = TRUE) {

  # Work on a plain data frame so that [[ ]] always yields a vector, even if a
  # tibble or a matrix was passed in.
  data <- as.data.frame(data)

  # Fail early, before anything is printed, if the column cannot be found.
  if (!is.character(var) || length(var) != 1L || !(var %in% names(data))) {
    stop("'var' must be the name of a single column of 'data'", call. = FALSE)
  }

  banner <- "**************************************************************************************"

  cat("\n")
  print(banner, quote = FALSE)
  cat("\n")

  # Print variable name
  print(var, quote = FALSE)
  cat("\n")

  # Check for missing data (on the raw values, before conversion to factor)
  values <- data[[var]]
  nmiss <- sum(is.na(values))
  n <- length(values)

  # Guard against 0/0 when the data frame has no rows
  share.missing <- if (n > 0) nmiss / n else 0
  percent.missing <- paste0(
    formatC(100 * share.missing, format = "f", digits = 0), "%"
  )
  print(
    paste("Missing observations:", nmiss, "of", n, "(", percent.missing, ")"),
    quote = FALSE
  )
  cat("\n")

  # Ensure that R knows the factor variable is factor, and not character
  values <- as.factor(values)

  # Frequency table: one row per level, followed by a totals row. The table is
  # assembled directly and the column names are assigned afterwards, so a
  # column called "var", "Count" or "Proportion" cannot clash with the
  # table's own columns.
  counts <- table(values)
  shares <- prop.table(counts)

  freq.table <- data.frame(
    c(levels(values), "Total"),
    c(as.numeric(counts), sum(counts)),
    c(round(as.numeric(shares), digits = 2), sum(shares)),
    stringsAsFactors = FALSE
  )
  names(freq.table) <- c(var, "Count", "Proportion")

  print("Frequency Table", quote = FALSE)
  print(freq.table)

  # Unique values of var - useful for identifying data entry errors, etc.
  # as.logical() keeps inputs such as 1 or "TRUE" working as they did before.
  if (isTRUE(as.logical(unique))) {
    # The argument `unique` shadows the function of the same name, hence the
    # explicit base:: prefix.
    var.values <- base::unique(values)
    var.values <- var.values[order(var.values)]

    cat("\n")
    print("Unique values", quote = FALSE)
    print(var.values)
  }

  cat("\n")
  print(banner, quote = FALSE)
  cat("\n")

  invisible(freq.table)
}

# Use the function for a single variable, first with and then without the
# listing of unique values
uni.factor(data = df, var = "variable3")
uni.factor(data = df, var = "variable3", unique = FALSE)


##############################################################
# Apply function to a group of variables with minimal effort #
##############################################################

# Loop over the column names themselves rather than over 1:length(...), which
# would iterate over c(1, 0) if the data frame had no columns.
for (column in names(df)) {
  uni.factor(data = df, var = column)
}

# Clean up the workspace
remove(column, df)