Descriptive statistics – Numeric variable

This post presents code that gives the user a quick overview of a numeric variable with a single function call. The code, which can easily be modified for your specific needs, currently reports the amount of missing data, the mean and standard deviation (most informative when the variable is approximately normally distributed), the median and deciles, the unique values of the variable, and the shape of the distribution.

The argument "unique=" takes a TRUE/FALSE value (FALSE is the default) and allows you to specify whether you want each value of the variable listed. This can be useful for identifying data entry errors but may produce too much output to meaningfully spot check when working with a large data set. 

The function produces two histograms to help you visualize the data. The first uses all non-missing data; the second omits extreme values and therefore allows you to zoom in on the bulk of the data. An isolated extreme value can compress the scale for the rest of the data, obscuring its shape, and the second histogram circumvents this problem. The argument "extreme=" sets the number of standard deviations from the mean beyond which an observation is considered extreme; only observations strictly inside mean ± extreme × SD are kept for the second histogram. The default is 3.

NOTE: The code below is a modified version of the original post. I modified it to facilitate looping through a list of variables, and include some code at the end of the post that demonstrates how to do this. For more information on loops, see this post.

# Build a small illustrative data set: three columns of 100 standard-normal
# draws each. The columns are created directly inside data.frame(), so no
# temporary vectors are left behind in the workspace.
df <- data.frame(
  variable1 = rnorm(100),
  variable2 = rnorm(100),
  variable3 = rnorm(100)
)

#######################
# The function itself #
#######################

# Print a quick descriptive overview of one numeric variable.
#
# Arguments:
#   data    - a data frame (anything else is coerced with as.data.frame())
#             that contains the variable.
#   var     - the name of the column to describe, as a single character string.
#   unique  - if TRUE, also print the sorted unique values of the variable
#             (default FALSE).
#   extreme - number of standard deviations from the mean beyond which an
#             observation is left out of the second histogram (default 3).
#
# Printed output, in order: the variable name, the count and percentage of
# missing observations, the mean, median and standard deviation, the deciles,
# optionally the unique values, a histogram of all non-missing values, the
# limits used to define extreme observations, and a second histogram that
# keeps only observations strictly inside those limits.
#
# Histograms need the ggplot2 package. If it is not installed the numeric
# summary is still printed and a warning reports that the plots were skipped.
# A histogram is also skipped (with a printed note) when the values to plot
# have no spread, because a bin width cannot be derived from a zero range.
#
# The function is called for its printed output and plots; it returns NULL
# invisibly. The "scipen" option is raised while the deciles and limits are
# printed and restored to its previous value when the function exits.
univariate <- function(data, var, unique=FALSE, extreme=3){

  # ---- Input checks -------------------------------------------------------
  if (!is.data.frame(data)) {
    data <- as.data.frame(data)
  }
  if (!is.character(var) || length(var) != 1 || is.na(var) || !(var %in% names(data))) {
    stop("'var' must be the name of a single column in 'data'", call. = FALSE)
  }
  values <- data[[var]]
  if (!is.numeric(values)) {
    stop("column '", var, "' must be numeric", call. = FALSE)
  }

  # ---- Local helpers ------------------------------------------------------

  # Print one line of text without surrounding quotes.
  say <- function(text) print(text, quote=FALSE)

  # Bin width that splits the range of x into 30 bins; NA when x is empty.
  bin.width <- function(x) {
    if (length(x) == 0) {
      return(NA_real_)
    }
    diff(range(x)) / 30
  }

  # TRUE when a bin width can actually be handed to geom_histogram().
  usable <- function(width) is.finite(width) && width > 0

  # Draw and print a histogram of `var` from plot.data.
  # .data[[var]] replaces the deprecated aes_string(); the axis label is set
  # explicitly so it still shows the plain variable name.
  draw.histogram <- function(plot.data, width, title) {
    graph <- ggplot2::ggplot(plot.data, ggplot2::aes(x = .data[[var]])) +
      ggplot2::geom_histogram(binwidth = width) +
      ggplot2::xlab(var) +
      ggplot2::ggtitle(title)
    print(graph)
  }

  banner <- "**************************************************************************************"

  cat("\n")
  say(banner)
  cat("\n")

  # Print variable name
  say(var)
  cat("\n")

  # ---- Missing data -------------------------------------------------------
  nmiss <- sum(is.na(values))
  n <- length(values)
  percent.missing <- paste0(formatC(100 * nmiss / n, format = "f", digits = 1), "%")
  say(paste("Missing observations:", nmiss, "of", n, "(", percent.missing, ")"))
  cat("\n")

  # ---- Mean, median and standard deviation --------------------------------
  var.mean <- mean(values, na.rm=TRUE)
  say(paste("Mean =", var.mean))

  # Named var.median so the stats::median function is not shadowed.
  var.median <- median(values, na.rm=TRUE)
  say(paste("Median = ", var.median))

  var.sd <- sd(values, na.rm=TRUE)
  say(paste("Standard deviation =", var.sd))
  cat("\n")

  # ---- Deciles ------------------------------------------------------------
  # Avoid scientific notation from here on, but put the caller's setting back
  # when the function exits instead of changing it for the whole session.
  old.options <- options(scipen=999)
  on.exit(options(old.options), add = TRUE)

  decile <- quantile(values, probs = seq(0, 1, 0.10), na.rm = TRUE, names = TRUE, type = 7)
  say("Deciles")
  print(decile)
  cat("\n")

  # ---- Unique values - useful for identifying data entry errors, etc. -----
  if (isTRUE(unique)) {
    say("Unique values")
    # base:: prefix because the argument `unique` has the same name.
    print(sort(base::unique(values), na.last = TRUE))
    cat("\n")
  }

  # ---- Histogram of all non-missing values --------------------------------
  have.ggplot <- requireNamespace("ggplot2", quietly = TRUE)
  if (!have.ggplot) {
    warning("Package 'ggplot2' is not installed; histograms were skipped", call. = FALSE)
  }

  # drop = FALSE keeps a data frame even when `data` has a single column.
  hist.data <- data[!is.na(values), , drop = FALSE]
  observed <- hist.data[[var]]

  width.all <- bin.width(observed)
  if (!usable(width.all)) {
    say("No histogram printed: fewer than two distinct non-missing values")
  } else if (have.ggplot) {
    draw.histogram(hist.data, width.all, paste(var, "Frequency Distribution"))
  }

  # ---- Histogram excluding extreme observations ---------------------------
  # Extreme observations are those `extreme` or more standard deviations
  # away from the mean (default 3).
  lower.limit <- var.mean - (var.sd*extreme)
  upper.limit <- var.mean + (var.sd*extreme)

  # Print limits
  say("Histogram Excluding Extreme Observations")
  say(paste("Extreme obs lower limit =", lower.limit))
  say(paste("Extreme obs upper limit =", upper.limit))
  cat("\n")

  # which() drops NA comparisons (e.g. when the standard deviation is NA),
  # matching how subset() treats them.
  inside <- which(observed > lower.limit & observed < upper.limit)
  hist.data.2 <- hist.data[inside, , drop = FALSE]
  kept.values <- base::unique(hist.data.2[[var]])

  width.kept <- bin.width(kept.values)
  # && short-circuits, so kept.values[[1]] is only read when it exists.
  if (length(kept.values) == 1 && kept.values[[1]] == 0) {
    say("No histogram excluding extreme observations printed: All non-outlier values are zero")
  } else if (!usable(width.kept)) {
    say("No histogram excluding extreme observations printed: fewer than two distinct non-outlier values")
  } else if (have.ggplot) {
    draw.histogram(hist.data.2, width.kept,
                   paste(var, "Frequency Distribution\n(Excluding Extreme Obs)"))
  }

  cat("\n")
  say(banner)
  cat("\n")

  invisible(NULL)
}


# Describe individual columns one call at a time:
#   - variable1 with a tighter cutoff (2 standard deviations) for the
#     "excluding extreme observations" histogram;
#   - variable2 with the default cutoff, also listing its unique values.
univariate(df, "variable1", extreme=2)
univariate(df, "variable2", unique=TRUE)


##############################################################
# Apply function to a group of variables with minimal effort #
##############################################################

# Every column name in df is passed to univariate() in turn.
vars <- names(df)

# seq_along() yields an empty sequence when there are no columns, so the loop
# body is simply skipped; 1:length(vars) would instead iterate over 1 and 0.
for (i in seq_along(vars)) {
  
  var <- vars[i]
  univariate(data=df, var=var)
  
}

# Clean up the objects created for this demonstration
remove(i,var,vars)
remove(df)