Reformat Time-Series Excel Table: Complete Code

This post, the last in a sequence of four, combines the code samples from the previous two posts and resolves the lingering issue that R interprets the column with the counts of active addresses as a character variable. In the code below, I set up the unique identifier for each zip code-parish combination and move the parish information into a variable before reshaping the data long, as this is more concise than trying to do these things afterwards, and leverages the unique identifier to reshape the data properly.

# data from http://www.datacenterresearch.org/data-resources/active-addresses-by-zip-code/

# Import file
library(XLConnect)
data <- readWorksheet(loadWorkbook("TheDataCenter_ActiveResidentialAddresses.xls"),sheet=1)

# R assigned nonsensical column names during the import process; remove them by setting them to placeholders.
columns <- dim(data)[2]
column.names <- character()

for (i in 1:columns) {

col.name <- paste("col", i, sep="")
column.names <- c(column.names, col.name)

}

remove(columns, i, col.name)
colnames(data) <- column.names

# Drop the rows from the dataset that do not contain data
data <- data[-c(1:3, 95:98),]
data <- data[rowSums(is.na(data))<ncol(data),]# Drop the intersperced blank rows.
rownames(data) <- NULL

# Create a vertical vector variable to represent years
years <- data[1,-1] # subset the row with the year data, minus the first column
years.t <- t(years) # transpose to obtain a vertical year vector
remove(years)
years.t <- as.data.frame(years.t)
library(plyr) # renaming variables, ddply function
years.t <- rename(years.t, c("1"="year"))
library(zoo) # na.locf function
years.t$year <- na.locf(years.t$year) # Carry the last observation forward for rows with missing year data
years.t$var <- rownames(years.t) # Need this as an identifier for merge, below.
rownames(years.t) <- NULL

# Create a vertical vector of months
months <- data[2,-1] # subset the row with the month data, minus the first column
months.t <- t(months)
remove(months)
months.t <- as.data.frame(months.t)
library(plyr) # renaming variables, ddply function
months.t <- rename(months.t, c("2"="month"))
months.t$var <- rownames(months.t) # Need this as an identifier for merge, below.
rownames(months.t) <- NULL

# Now drop the date rows, as R thinks they're observations
data <- data[-c(1,2),]

# Rename the first column to reflect the fact that it contains zip code data
data <- rename(data, c("col1"="zipcode"))

# The parish for each zip code was a section heading on a separate row in the Excel file.
# From R's perspective, this makes each parish just another observation.
# We need to change the parish rows from observations to a variable and assign the correct parish to each zipcode.

# Before doing this, we need to make sure that zipcode is a unique identifier and that no zipcodes appear in more than one parish.
zipcodes <- data$zipcode
duplicates <- zipcodes[duplicated(zipcodes)]
dup.zips <- subset(data, zipcode %in% duplicates)
table(dup.zips$zipcode)
remove(zipcodes, duplicates, dup.zips)

# This tells us that 70068 appears in two different parishes, although we don't know which ones from the data,
# which illustrates why parish needs to be a variable 

# 70068 appears twice, so we need to set up a unique identifier both in the primary data set,
# and in any data set we might merge into the primary set.

data$unique.ID <- rownames(data)
data$unique.ID <- as.integer(data$unique.ID)
rownames(data) <- NULL

data$parish.row <- ifelse(grepl("Parish", data$zipcode), data$zipcode, NA) 

library(zoo) # na.locf function
data$parish <- na.locf(data$parish.row) # "last observation carry forward" method.

# Check that the parish information was filled in
delete <- subset(data, select=c("zipcode","parish.row","parish"))
remove(delete)

# Delete parish rows, which are not unique observations but summaries for the zipcodes below
data <- subset(data, is.na(data$parish.row))
data$parish.row <- NULL

################
# Reshape long #
################

# Remove first column from column.names
column.names <- column.names[!is.element(column.names, "col1")]

# Expected number of rows in the new dataset
num.rows <- nrow(data) * length(column.names)

# We don't need to worry about the order of the column names here - we'll be merging the date information in by column,
# so the correct date information will still map to the correct population numbers.

library(reshape2)
data.long <- reshape(data,
varying = c(column.names),
v.names = "active_addresses",
timevar = "var",
idvar = c("unique.ID", "zipcode", "parish"),
times = c(column.names),
new.row.names = 1:num.rows,
direction = "long")
rownames(data.long) <- NULL
remove(num.rows, column.names)

data.long <- merge(data.long, years.t, by="var")
data.long <- merge(data.long, months.t, by="var")
remove(months.t, years.t)

# Create a date variable that R recognizes as a date
data.long$date <- paste(data.long$month, data.long$year, sep="-")
data.long$date <- as.Date(paste('01', data.long$date), format='%d %b-%Y')

R still thinks that both zipcode and active_addresses are character variables. This isn't a big problem for zip code, but we need to be able to quantify changes in active_addresses. To convert it to a numeric variable, we'll remove the commas from active_addresses and then as.numeric() will work.

# Convert active.addresses string variable to numbers
data.long$active_addresses <- as.numeric(gsub("," , "", data.long$active_addresses))

# Identify where the missing values are coming from
# St. Bernard Parish has missing values until Sept 2006 
missing <- subset(data.long, is.na(data.long$active_addresses))
remove(missing)

data.long$zipcode <- as.numeric(data.long$zipcode)

You may have noticed above that R thinks there are 14 different months, not 12, and considers year a factor. Neither of these is a problem. The 14 months are created because June and July are spelled with both the first three and with all four letters at different places in the data. Delightfully, the as.Date() function easily interprets both the three- and four-letter spellings, so the date variable still represents these months correctly. Nor is it a particular issue that year is a factor... Given that we have date to quantify the passage of time, we'd probably only analyze year if we were treating it as a factor variable of some sort (e.g., was there a significant difference in the number of active addresses between years 2014 and 2015?). If this bothers you, though, you can convert year from a factor to a number as follows:

data.long$year <-as.numeric(as.character(data.long$year))

data.long <- subset(data.long, select=c("parish", "unique.ID", "zipcode", "date", "year", "month", "active_addresses"))
data.long <- data.long[order(data.long$parish, data.long$unique.ID, data.long$zipcode, data.long$date),]
rownames(data.long) <- NULL