dplyr

General

  • Commas (,) are read as & in filter()
  • Use the pipe %>% to pass the result of one function on to the next one

Functions

Note: The base equivalents of the dplyr functions are shown for illustration only. The functions in the two packages may behave differently.

Create a data.frame

# dplyr (tibble() replaces the deprecated data_frame())
d <- tibble(ID = 1:10)

# base
d <- data.frame(ID = 1:10)

See also as_data_frame() and as.data.frame().

filter()

Subset the data by rows given a condition.

# In dplyr: keep the rows where TIME > 0 and MDV == 0
# (conditions separated by commas are combined with &)
data %>% 
  filter(TIME > 0, MDV == 0)

# In base: the same two conditions written as a logical row index
data[data$TIME > 0 & data$MDV == 0,]

slice()

Subset the data by rows given their position.

# In dplyr: keep rows 1 to 100, selected by position
data %>% 
  slice(1:100)

# In base: the same positions used as an integer row index
data[1:100,]

select()

Subset the data by columns.

# In dplyr
data %>% 
  select(ID, TIME) # Select columns
data %>% 
  select(ID:MDV)   # Range of columns

# In base
# A character vector of names selects several columns at once, matching
# select(ID, TIME) above.
data[, c('ID', 'TIME')]                # Select columns
# Look up the positions of the first and last column, then take the range.
data[, which(colnames(data) == 'ID'):
       which(colnames(data) == 'MDV')] # Range of columns

group_by()

Define groups on the data that will later be used by other functions.

# In dplyr: the grouping is stored on the data and used by the verbs that follow
data %>% 
  group_by(ID, OCC) %>% 
  ...

# In base: the grouping is passed to each call that needs it
by(data = data, INDICES = data[, c('ID', 'OCC')], FUN = ...)
## or
# aggregate() requires a two-sided formula; `.` on the left-hand side stands
# for all columns that are not used as grouping variables.
aggregate(. ~ ID + OCC, data = data, FUN = ...)

Important: remove the groups with ungroup() once they are no longer needed.

# In dplyr: ungroup() at the end of the chain removes the groups set by group_by()
data %>% 
  group_by(ID) %>% 
  ... %>% 
  ungroup()

summarize()

Summarise information on the dataset. Note that the n() function is part of dplyr and counts the rows in each group, as length() would do in base.

# In dplyr: a single summarize() call computes both statistics for each ID
data %>% 
  group_by(ID) %>% 
  summarize(Cmax = max(DV),
            Nobs = n())

# In base: one aggregate() call per statistic
aggregate(DV~ID, data = data, FUN = max)    # For Cmax
aggregate(DV~ID, data = data, FUN = length) # For count

mutate()

Add or edit columns. See also transmute() to drop the original columns and mutate_each() to apply a function to all columns.

# In dplyr: columns can be referred to by their bare name inside mutate()
data %>% 
  mutate(TIME_H = TIME_D * 24)

# In base: the right-hand side is evaluated outside the data frame, so the
# source column must be qualified with data$ as well.
data$TIME_H <- data$TIME_D * 24

rename()

Rename column.

# In dplyr: new name on the left, existing name on the right (TIME becomes TAD)
data %>% 
  rename(TAD = TIME)

# In base: overwrite the matching entry of the column names
colnames(data)[colnames(data) == 'TIME'] <- 'TAD'

arrange()

Order rows.

# In dplyr: sort the rows by ID, then by TIME
data %>% 
  arrange(ID, TIME)

# In base: order() returns the sorted row positions, used here as a row index
data[order(data$ID, data$TIME),]

Use arrange(desc(VAR)) to sort in decreasing order.

join functions

full_join()

Merge and keep all rows from both datasets.

# In dplyr: keep all rows of data and of data2
data %>% 
  full_join(data2)

# In base: all = TRUE keeps the unmatched rows of both tables
merge(data , data2, all = TRUE)

inner_join()

Merge only rows that are matched in both datasets.

# In dplyr: keep only the rows that have a match in both data and data2
data %>% 
  inner_join(data2)

# In base: all = FALSE discards the unmatched rows of both tables
merge(data, data2, all = FALSE)

left_join()

Merge and keep all rows from the left dataset (i.e. x). See also right_join().

# In dplyr: keep all rows of data (the left table, x)
data %>% 
  left_join(data2)

# In base: all.x = TRUE keeps the unmatched rows of the first table only
merge(data, data2, all.x = TRUE)

semi_join()

Subset x for all rows that match in y (no merge).

anti_join()

Subset x for all rows that do not match in y (no merge).

Miscellaneous functions

between(x, -1, 1)         # Shortcut for x >= -1 & x <= 1
do()                      # Use an outside function with group_by()
n() / n_distinct()        # Number of rows / number of distinct values
failwith()                # Assign default return value if function crashes (deprecated in newer dplyr; see purrr::possibly())
top_n()                   # Select top n rows
first() / last()          # First / Last row or value
nth()                     # n th row or value
lead() / lag()            # Values shifted by 1 (i.e. use value from next or previous row)
glimpse()                 # Get a glimpse at the data
group_indices()           # Generate a unique id for each group
group_size() / n_groups() # Size of each group / number of groups
row_number()              # Windowed rank functions

Use dplyr in functions

# User side: the grouping columns and the column to summarise, given as strings
group     <- c('VAR1', 'VAR')
col_name  <- 'DV'

# Function side
# all_of() selects the grouping columns named in `group`, and
# .data[[col_name]] looks the summarised column up by its name, so no R code
# has to be pasted together from strings. This replaces the deprecated
# group_by_() and summarise_().
med <- dat %>% 
  group_by(across(all_of(group))) %>% 
  summarise(MED = stats::median(.data[[col_name]]))

results matching ""

    No results matching ""