Remember the basic rules of tidy data structure

  1. One column per type of information
  2. One row per observation
  3. One value in each cell
install.packages("tidyr")
library(tidyr)
library(dplyr)

Gather

Copy the link to the Western Ghats tree data from the datasets page

# Download the tab-delimited macroplot table and load it as a data frame.
raw_data <- read.csv(
  "http://esapubs.org/archive/ecol/E091/216/Macroplot_data_Rev.txt",
  sep = "\t"
)

View data

Lead a discussion about what the correct (tidy) structure for this data should be

# Reshape wide to long: the TreeGirth1..TreeGirth5 columns are stacked into
# a key column `stem` (the former column name) and a value column `girth`.
clean_data <- raw_data %>%
  gather(key = stem, value = girth, TreeGirth1:TreeGirth5)

View data

# Step 1: stack the TreeGirth1..TreeGirth5 columns into `stem` / `girth`.
# Step 2: keep only rows whose girth is non-zero (filter() also drops rows
#         where the comparison is NA).
clean_data <- raw_data %>%
  gather(key = stem, value = girth, TreeGirth1:TreeGirth5) %>%
  filter(girth != 0)

Extract

# Step 1: stack the TreeGirth1..TreeGirth5 columns into `stem` / `girth`.
# Step 2: keep only rows whose girth is non-zero.
# Step 3: replace `stem` with just the single character captured after the
#         "TreeGirth" prefix (e.g. "TreeGirth3" becomes "3").
clean_data <- raw_data %>%
  gather(key = stem, value = girth, TreeGirth1:TreeGirth5) %>%
  filter(girth != 0) %>%
  extract(col = stem, into = "stem", regex = "TreeGirth(.)")

Separate

# Step 1: stack the TreeGirth1..TreeGirth5 columns into `stem` / `girth`.
# Step 2: keep only rows whose girth is non-zero.
# Step 3: reduce `stem` to the single character captured after "TreeGirth".
# Step 4: split SpCode by position: the first four characters become
#         `genus`, the remainder becomes `species`.
clean_data <- raw_data %>%
  gather(key = stem, value = girth, TreeGirth1:TreeGirth5) %>%
  filter(girth != 0) %>%
  extract(col = stem, into = "stem", regex = "TreeGirth(.)") %>%
  separate(col = SpCode, into = c("genus", "species"), sep = 4)

Unite and Spread

# Count the rows of clean_data for every PlotID / genus / species
# combination; the tally is stored in `count`.
# summarize() only peels off the last grouping variable, so without the
# explicit ungroup() the result would stay grouped by PlotID and genus and
# carry that grouping into later reshaping steps that consume `genus`.
stem_counts <- clean_data %>%
  group_by(PlotID, genus, species) %>%
  summarize(count = n()) %>%
  ungroup()
# Paste `genus` and `species` together (default "_" separator) into a single
# `species_id` column, replacing the two source columns.
stem_counts_wide <- stem_counts %>%
  unite(col = species_id, genus, species)
# Step 1: paste `genus` and `species` into a single `species_id` column.
# Step 2: reshape long to wide: one column per species_id holding its
#         `count`, with 0 filled in where a combination has no row.
stem_counts_wide <- stem_counts %>%
  unite(col = species_id, genus, species) %>%
  spread(key = species_id, value = count, fill = 0)