NexusCS

R

Languages
Quick reference for R - a language and environment for statistical computing, data analysis, and visualization widely used in data science.
data-science
statistics
programming

Getting started

Introduction

R is a language for statistical computing and data analysis, widely used in data science, research, and visualization.

Installation

# Install R
brew install r              # macOS
sudo apt install r-base     # Ubuntu

# Install RStudio (IDE)
# Download from rstudio.com

Quick Example

# Load data
data <- read.csv("data.csv")

# Analyze
summary(data)
mean(data$value)

# Visualize
plot(data$x, data$y)

Basic Syntax

# Assignment
x <- 5              # Standard R style
x = 5               # Also works
5 -> x              # Right assignment

# Comments
# This is a comment

# Print
print(x)
cat("Value:", x)

Data Types

Vectors

# Numeric vector
nums <- c(1, 2, 3, 4, 5)

# Integer vector
ints <- c(1L, 2L, 3L)

# Logical vector
bools <- c(TRUE, FALSE, TRUE)

# Character vector
chars <- c("a", "b", "c")

# Sequences
1:10                # 1 2 3 ... 10
seq(1, 10, 2)       # 1 3 5 7 9
rep(1, 5)           # 1 1 1 1 1

Lists

# Create list
lst <- list(
  name = "John",
  age = 30,
  scores = c(85, 90, 95)
)

# Access elements
lst$name            # "John"
lst[[1]]            # "John"
lst[["age"]]        # 30

# Named vs unnamed
list(a = 1, b = 2)  # Named
list(1, 2, 3)       # Unnamed

Data Frames

# Create data frame
df <- data.frame(
  name = c("Alice", "Bob", "Carol"),
  age = c(25, 30, 35),
  score = c(85, 90, 95)
)

# Access columns
df$name             # Column vector
df[["age"]]         # Column vector
df[, "score"]       # Column vector

# Access rows
df[1, ]             # First row
df[1:2, ]           # First two rows

# Dimensions
nrow(df)            # Number of rows
ncol(df)            # Number of columns
dim(df)             # c(rows, cols)

Matrices

# Create matrix
m <- matrix(1:9, nrow = 3, ncol = 3)

# By row instead of column
m <- matrix(1:9, nrow = 3, byrow = TRUE)

# Access elements
m[1, 2]             # Row 1, col 2
m[1, ]              # Row 1
m[, 2]              # Column 2

# Matrix operations
t(m)                # Transpose
m %*% m             # Matrix multiplication

Factors

# Create factor
gender <- factor(c("M", "F", "F", "M"))

# With levels
sizes <- factor(
  c("S", "M", "L", "M"),
  levels = c("S", "M", "L", "XL")
)

# Ordered factor
rating <- factor(
  c("low", "high", "medium"),
  levels = c("low", "medium", "high"),
  ordered = TRUE
)

# Work with factors
levels(gender)      # Get levels
relevel(gender, ref = "F")  # Change reference

Subsetting

Vector Subsetting

x <- c(10, 20, 30, 40, 50)

# Positive indices
x[1]                # 10
x[c(1, 3, 5)]       # 10 30 50
x[1:3]              # 10 20 30

# Negative indices (exclude)
x[-1]               # 20 30 40 50
x[-c(1, 5)]         # 20 30 40

# Logical subsetting
x[x > 25]           # 30 40 50
x[x %% 20 == 0]     # 20 40

# Named vectors
y <- c(a = 1, b = 2, c = 3)
y["b"]              # 2
y[c("a", "c")]      # 1 3

Data Frame Subsetting

df <- data.frame(
  x = 1:5,
  y = letters[1:5],
  z = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)

# Single bracket (returns data frame)
df[1:2, ]           # First 2 rows
df[, 1:2]           # First 2 columns
df[df$x > 2, ]      # Rows where x > 2

# Double bracket (returns vector)
df[[1]]             # First column as vector

# Dollar sign (returns vector)
df$x                # x column as vector

# subset() function
subset(df, x > 2)   # Rows where x > 2
subset(df, x > 2, select = c(x, y))  # Select columns

List Subsetting

lst <- list(
  a = 1:3,
  b = "text",
  c = list(x = 10, y = 20)
)

# Single bracket (returns list)
lst[1]              # list(a = 1:3)
lst["a"]            # list(a = 1:3)

# Double bracket (returns element)
lst[[1]]            # 1:3
lst[["a"]]          # 1:3

# Dollar sign
lst$a               # 1:3

# Nested lists
lst$c$x             # 10
lst[[c(3, 1)]]      # 10

Functions

Function Definition

# Basic function: takes one argument and returns its square
square <- function(x) {
  return(x^2)  # explicit return()
}

# Implicit return (last expression)
# Redefines square(); the value of the last evaluated expression is returned
square <- function(x) {
  x^2
}

# Multiple arguments
power <- function(x, n) {
  x^n  # x raised to the power n
}

# Default arguments
# Redefines power(); n falls back to 2 when the caller omits it
power <- function(x, n = 2) {
  x^n
}

Function Arguments

# Named arguments
power(x = 5, n = 3)
power(n = 3, x = 5)  # Order doesn't matter

# Positional matching
power(5, 3)          # x = 5, n = 3 by position
power(5, n = 3)      # x matched by position, n by name

# Partial matching: an argument name may be abbreviated to a unique prefix
# (handy interactively; spell names out in scripts)
seq(1, 10, length = 4)  # `length` matches `length.out`: 1 4 7 10

# ... (dot-dot-dot): accept any number of arguments, named or not
myprint <- function(...) {
  # Bundle the arguments into a list and print it in one step
  print(list(...))
}

Anonymous Functions

# Traditional
sapply(1:5, function(x) x^2)

# R 4.1+ shorthand
sapply(1:5, \(x) x^2)

# With multiple arguments
mapply(\(x, y) x + y, 1:3, 4:6)

Control Flow

Conditionals

# if/else
if (x > 0) {
  print("positive")
} else if (x < 0) {
  print("negative")
} else {
  print("zero")
}

# ifelse (vectorized)
ifelse(x > 0, "pos", "neg")

# switch
result <- switch(
  type,
  "a" = 1,
  "b" = 2,
  "c" = 3,
  NA  # default
)

Loops

# for loop
for (i in 1:10) {
  print(i)
}

# for with vectors
fruits <- c("apple", "banana", "orange")
for (fruit in fruits) {
  print(fruit)
}

# while loop
i <- 1
while (i <= 10) {
  print(i)
  i <- i + 1
}

# repeat loop
i <- 1
repeat {
  print(i)
  i <- i + 1
  if (i > 10) break
}

# next (continue)
for (i in 1:10) {
  if (i %% 2 == 0) next
  print(i)  # Only odd numbers
}

Apply Family

apply()

# Apply over matrix/array
m <- matrix(1:12, nrow = 3)

# Apply to rows (margin = 1)
apply(m, 1, sum)    # Row sums

# Apply to columns (margin = 2)
apply(m, 2, mean)   # Column means

# Custom function
apply(m, 1, function(x) max(x) - min(x))

lapply() & sapply()

# lapply returns list
nums <- list(a = 1:3, b = 4:6, c = 7:9)
lapply(nums, sum)   # list(a = 6, b = 15, c = 24)

# sapply simplifies to vector/matrix
sapply(nums, sum)   # c(a = 6, b = 15, c = 24)

# On vectors
sapply(1:5, function(x) x^2)  # 1 4 9 16 25

# vapply (with explicit type)
vapply(nums, sum, numeric(1))

tapply()

# Apply by groups
scores <- c(85, 90, 75, 80, 95, 70)
groups <- c("A", "B", "A", "B", "A", "B")

tapply(scores, groups, mean)
#  A  B
# 85 80
# (A: mean of 85, 75, 95; B: mean of 90, 80, 70)

# Multiple grouping factors (one result cell per combination of levels)
tapply(scores, list(groups, c(1, 1, 2, 2, 1, 2)), mean)

Other Apply Functions

# mapply (multivariate)
mapply(function(x, y) x + y, 1:3, 4:6)  # 5 7 9

# rapply (recursive for lists)
lst <- list(a = 1:3, b = list(c = 4:6))
rapply(lst, sum, how = "list")

dplyr Package

Basic Verbs

library(dplyr)

# filter() - subset rows
filter(df, age > 25)
filter(df, age > 25 & score < 90)

# select() - subset columns
select(df, name, age)
select(df, -score)           # Exclude
select(df, starts_with("a"))

# mutate() - add/modify columns
mutate(df, age_double = age * 2)
mutate(df, age = age + 1)    # Modify

# arrange() - sort rows
arrange(df, age)             # Ascending
arrange(df, desc(age))       # Descending

# summarise() - aggregate
summarise(df, mean_age = mean(age))

Grouping & Piping

# group_by() + summarise()
df %>%
  group_by(department) %>%
  summarise(
    mean_age = mean(age),
    count = n()
  )

# Pipe operator
df %>%
  filter(age > 25) %>%
  select(name, score) %>%
  arrange(desc(score))

# Native pipe (R 4.1+)
df |>
  filter(age > 25) |>
  mutate(high_score = score > 90)

Other dplyr Functions

# count() - count unique values
count(df, department)

# distinct() - unique rows
distinct(df, department)

# slice() - select rows by position
slice(df, 1:5)
slice_head(df, n = 3)
slice_tail(df, n = 3)
slice_max(df, order_by = score, n = 5)

# rename() - rename columns
rename(df, full_name = name)

# relocate() - reorder columns
relocate(df, score, .before = age)

Data Import/Export

CSV Files

# Read CSV
df <- read.csv("data.csv")
df <- read.csv("data.csv", header = TRUE)
df <- read.csv("data.csv", stringsAsFactors = FALSE)

# Write CSV
write.csv(df, "output.csv")
write.csv(df, "output.csv", row.names = FALSE)

# readr package (faster, better defaults)
library(readr)
df <- read_csv("data.csv")
write_csv(df, "output.csv")

R Data Files

# Save single object
saveRDS(df, "data.rds")
df <- readRDS("data.rds")

# Save multiple objects
save(df1, df2, file = "data.RData")
load("data.RData")  # Objects restored with original names

# Save workspace
save.image("workspace.RData")
load("workspace.RData")

Other Formats

# Excel (readxl package)
library(readxl)
df <- read_excel("data.xlsx")
df <- read_excel("data.xlsx", sheet = "Sheet1")

# JSON (jsonlite package)
library(jsonlite)
data <- fromJSON("data.json")
write_json(df, "output.json")

# Database (DBI package)
library(DBI)
con <- dbConnect(RSQLite::SQLite(), "database.db")
df <- dbReadTable(con, "table_name")
dbDisconnect(con)

Data Manipulation

Base R Functions

# subset()
subset(df, age > 25)
subset(df, age > 25 & score < 90)
subset(df, age > 25, select = c(name, score))

# merge() - join data frames
df1 <- data.frame(id = 1:3, name = c("A", "B", "C"))
df2 <- data.frame(id = 2:4, score = c(80, 90, 95))

merge(df1, df2, by = "id")              # Inner join
merge(df1, df2, by = "id", all = TRUE)  # Outer join
merge(df1, df2, by = "id", all.x = TRUE) # Left join

# aggregate()
aggregate(score ~ department, data = df, mean)
aggregate(cbind(age, score) ~ department, data = df, mean)

Data Reshaping

# reshape2 package
library(reshape2)

# Wide to long
df_long <- melt(df, id.vars = "id")

# Long to wide
df_wide <- dcast(df_long, id ~ variable)

# tidyr package
library(tidyr)

# pivot_longer (wide to long)
df_long <- pivot_longer(df, cols = -id)

# pivot_wider (long to wide)
df_wide <- pivot_wider(df_long, names_from = name, values_from = value)

Combining Data

# rbind() - combine rows (data frames need the same column names)
df3 <- rbind(df1, df2)

# cbind() - combine columns (data frames need the same number of rows)
df3 <- cbind(df1, df2)

# bind_rows() (dplyr) - matches columns by name, fills missing ones with NA
bind_rows(df1, df2)

# bind_cols() (dplyr) - binds by position; row counts must match
bind_cols(df1, df2)

Statistics

Descriptive Statistics

# Central tendency
mean(x)             # Mean
median(x)           # Median
# Statistical mode: no built-in; take the most frequent value from table().
# (base mode(x) returns the storage mode, e.g. "numeric", not the statistic)
names(which.max(table(x)))  # Most frequent value, returned as a string

# Variability
sd(x)               # Standard deviation
var(x)              # Variance
IQR(x)              # Interquartile range
range(x)            # Min and max
diff(range(x))      # Range value

# Summary
summary(x)          # Five-number summary + mean
quantile(x)         # Quartiles
quantile(x, probs = c(0.25, 0.75))

Statistical Tests

# t-test
t.test(x)           # One-sample
t.test(x, y)        # Two-sample
t.test(x ~ group, data = df)  # Formula notation

# Chi-square test
chisq.test(table(x, y))

# Correlation
cor(x, y)           # Pearson
cor(x, y, method = "spearman")
cor.test(x, y)      # With significance

# Linear regression
model <- lm(y ~ x, data = df)
summary(model)
predict(model, newdata)

Distributions

# Normal distribution
rnorm(100)          # Random samples
dnorm(x)            # Density
pnorm(x)            # Cumulative probability
qnorm(p)            # Quantile

# Other distributions
runif(100)          # Uniform
rpois(100, lambda = 5)  # Poisson
rbinom(100, size = 10, prob = 0.5)  # Binomial

Plotting

Base Graphics

# Scatter plot
plot(x, y)
plot(x, y, main = "Title", xlab = "X", ylab = "Y")
plot(x, y, col = "blue", pch = 19)

# Line plot
plot(x, y, type = "l")
plot(x, y, type = "b")  # Both points and lines

# Histogram
hist(x)
hist(x, breaks = 20, col = "lightblue")

# Boxplot
boxplot(x)
boxplot(score ~ department, data = df)

# Barplot
barplot(table(x))
barplot(counts, names.arg = labels)

ggplot2 Basics

library(ggplot2)

# Basic template
ggplot(df, aes(x = x, y = y)) +
  geom_point()

# Scatter plot
ggplot(df, aes(x = age, y = score)) +
  geom_point(aes(color = department))

# Line plot
ggplot(df, aes(x = time, y = value)) +
  geom_line()

# Bar plot
ggplot(df, aes(x = category)) +
  geom_bar()

# Histogram
ggplot(df, aes(x = value)) +
  geom_histogram(bins = 30)

# Boxplot
ggplot(df, aes(x = group, y = value)) +
  geom_boxplot()

ggplot2 Customization

# Faceting
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~ category)

# Themes
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  theme_minimal()

# Labels
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  labs(
    title = "Title",
    x = "X Label",
    y = "Y Label"
  )

# Scales
ggplot(df, aes(x = x, y = y, color = z)) +
  geom_point() +
  scale_color_gradient(low = "blue", high = "red")

String Manipulation

Basic String Operations

# Concatenate
paste("Hello", "World")         # "Hello World"
paste0("Hello", "World")        # "HelloWorld"
paste(c("a", "b"), collapse = ",")  # "a,b"

# Substring
substr("Hello World", 1, 5)     # "Hello"
substring("Hello World", 7)     # "World"

# String length
nchar("Hello")                  # 5

# Case conversion
toupper("hello")                # "HELLO"
tolower("HELLO")                # "hello"

Pattern Matching

# grep() - find matches
grep("pattern", c("text", "pattern", "other"))  # 2
grep("pattern", strings, value = TRUE)  # Return values

# grepl() - logical vector
grepl("pattern", strings)       # TRUE/FALSE vector

# sub() & gsub() - replace
sub("old", "new", "old text old")    # Replace first
gsub("old", "new", "old text old")   # Replace all

# Regular expressions
gsub("[0-9]", "", "abc123")     # "abc"
gsub("\\s+", " ", "a  b   c")   # "a b c"

stringr Package

library(stringr)

# Detect
str_detect(strings, "pattern")

# Extract
str_extract(strings, "pattern")
str_extract_all(strings, "pattern")

# Replace
str_replace(strings, "old", "new")
str_replace_all(strings, "old", "new")

# Split
str_split(strings, ",")

# Trim whitespace
str_trim("  hello  ")           # "hello"

Package Management

Installing Packages

# Install from CRAN
install.packages("dplyr")
install.packages(c("dplyr", "ggplot2"))

# Install from GitHub
install.packages("devtools")
devtools::install_github("user/repo")

# Install from Bioconductor
if (!requireNamespace("BiocManager"))
  install.packages("BiocManager")
BiocManager::install("package")

Loading Packages

# library() - load package
library(dplyr)

# require() - load with check
if (require(dplyr)) {
  # dplyr is available
}

# Namespace access (no loading)
dplyr::filter(df, age > 25)

# Unload package
detach("package:dplyr", unload = TRUE)

Managing Packages

# List installed packages
installed.packages()

# Update packages
update.packages()

# Remove package
remove.packages("dplyr")

# Check if installed
# installed.packages() returns a matrix; compare against its row names
# (the package names) rather than every cell of the matrix
"dplyr" %in% rownames(installed.packages())
nzchar(system.file(package = "dplyr"))  # Faster: no scan of all packages

Operators

Arithmetic Operators

x + y               # Addition
x - y               # Subtraction
x * y               # Multiplication
x / y               # Division
x ^ y               # Exponentiation
x %% y              # Modulo
x %/% y             # Integer division

Comparison Operators

x == y              # Equal
x != y              # Not equal
x > y               # Greater than
x < y               # Less than
x >= y              # Greater or equal
x <= y              # Less or equal

Logical Operators

x & y               # Element-wise AND
x | y               # Element-wise OR
!x                  # NOT
x && y              # Scalar AND
x || y              # Scalar OR
xor(x, y)           # Exclusive OR

Special Operators

x %in% y            # Match: is each element of x found in y?
x %*% y             # Matrix multiplication
x %>% f()           # Pipe (magrittr; re-exported by dplyr)
x |> f()            # Native pipe (R 4.1+)

Working Directory & Files

Directory Operations

# Get working directory
getwd()

# Set working directory
setwd("/path/to/directory")

# List files
list.files()
list.files(pattern = "\\.csv$")
dir()               # Alias for list.files()

# File info
file.exists("file.txt")
file.info("file.txt")

File Operations

# Create directory
dir.create("new_folder")

# Copy file
file.copy("from.txt", "to.txt")

# Rename/move file
file.rename("old.txt", "new.txt")

# Delete file
file.remove("file.txt")

# Path operations
file.path("folder", "file.txt")  # OS-independent paths
basename("/path/to/file.txt")    # "file.txt"
dirname("/path/to/file.txt")     # "/path/to"

Gotchas

Common Pitfalls

Factors vs Characters

# Factors can surprise you
# Before R 4.0, read.csv() turned strings into factors by default;
# stringsAsFactors = TRUE reproduces that behaviour on any version
df <- read.csv("file.csv", stringsAsFactors = TRUE)
df$name[1] <- "New Name"    # NA + warning: not an existing factor level

# Solution: keep strings as character (the default since R 4.0)
df <- read.csv("file.csv", stringsAsFactors = FALSE)

Vector Recycling

# Shorter vectors are recycled
c(1, 2, 3) + c(1, 2)        # Warning: length mismatch
# Result: 2 4 4 (1+1, 2+2, 3+1)

# Be explicit with rep()
c(1, 2, 3) + rep(c(1, 2), length.out = 3)

[ vs [[

# Single bracket returns same type
lst[1]              # Returns list
df[1]               # Returns data frame

# Double bracket extracts element
lst[[1]]            # Returns element
df[[1]]             # Returns vector

NA Comparisons

# NA comparisons return NA
x <- NA
x == NA             # NA (not TRUE!)

# Use is.na()
is.na(x)            # TRUE

Performance Tips

# Pre-allocate vectors
x <- numeric(1000)  # Good: allocated once
for (i in seq_along(x)) x[i] <- i

# Growing vectors is slow
x <- c()            # Bad: every c(x, i) copies the whole vector
for (i in 1:1000) x <- c(x, i)

# Use vectorized operations
sum(x)              # Fast
result <- 0         # Slow
for (i in x) result <- result + i

# Use vectorized row/column helpers instead of loops
rowSums(m)          # Fast (compiled); also colSums(), rowMeans(), colMeans()
apply(m, 1, sum)    # Concise, but still an R-level loop underneath
sums <- numeric(nrow(m))  # Explicit loop
# seq_len() is safe when nrow(m) is 0 (1:0 would iterate over c(1, 0))
for (i in seq_len(nrow(m))) sums[i] <- sum(m[i, ])

Also see