Getting started
Introduction
R is a language for statistical computing and data analysis. It is widely used in data science, research, and visualization.
Installation
# Install R
brew install r # macOS
sudo apt install r-base # Ubuntu
# Install RStudio (IDE)
# Download from rstudio.com
Quick Example
# Load data
data <- read.csv("data.csv")
# Analyze
summary(data)
mean(data$value)
# Visualize
plot(data$x, data$y)
Basic Syntax
# Assignment
x <- 5 # Standard R style
x = 5 # Also works
5 -> x # Right assignment
# Comments
# This is a comment
# Print
print(x)
cat("Value:", x)
Data Types
Vectors
# Numeric vector
nums <- c(1, 2, 3, 4, 5)
# Integer vector
ints <- c(1L, 2L, 3L)
# Logical vector
bools <- c(TRUE, FALSE, TRUE)
# Character vector
chars <- c("a", "b", "c")
# Sequences
1:10 # 1 2 3 ... 10
seq(1, 10, 2) # 1 3 5 7 9
rep(1, 5) # 1 1 1 1 1
Lists
# Create list
lst <- list(
name = "John",
age = 30,
scores = c(85, 90, 95)
)
# Access elements
lst$name # "John"
lst[[1]] # "John"
lst[["age"]] # 30
# Named vs unnamed
list(a = 1, b = 2) # Named
list(1, 2, 3) # Unnamed
Data Frames
# Create data frame
df <- data.frame(
name = c("Alice", "Bob", "Carol"),
age = c(25, 30, 35),
score = c(85, 90, 95)
)
# Access columns
df$name # Column vector
df[["age"]] # Column vector
df[, "score"] # Column vector
# Access rows
df[1, ] # First row
df[1:2, ] # First two rows
# Dimensions
nrow(df) # Number of rows
ncol(df) # Number of columns
dim(df) # c(rows, cols)
Matrices
# Create matrix
m <- matrix(1:9, nrow = 3, ncol = 3)
# By row instead of column
m <- matrix(1:9, nrow = 3, byrow = TRUE)
# Access elements
m[1, 2] # Row 1, col 2
m[1, ] # Row 1
m[, 2] # Column 2
# Matrix operations
t(m) # Transpose
m %*% m # Matrix multiplication
Factors
# Create factor
gender <- factor(c("M", "F", "F", "M"))
# With levels
sizes <- factor(
c("S", "M", "L", "M"),
levels = c("S", "M", "L", "XL")
)
# Ordered factor
rating <- factor(
c("low", "high", "medium"),
levels = c("low", "medium", "high"),
ordered = TRUE
)
# Work with factors
levels(gender) # Get levels
relevel(gender, ref = "F") # Change reference
Subsetting
Vector Subsetting
x <- c(10, 20, 30, 40, 50)
# Positive indices
x[1] # 10
x[c(1, 3, 5)] # 10 30 50
x[1:3] # 10 20 30
# Negative indices (exclude)
x[-1] # 20 30 40 50
x[-c(1, 5)] # 20 30 40
# Logical subsetting
x[x > 25] # 30 40 50
x[x %% 20 == 0] # 20 40
# Named vectors
y <- c(a = 1, b = 2, c = 3)
y["b"] # 2
y[c("a", "c")] # 1 3
Data Frame Subsetting
df <- data.frame(
x = 1:5,
y = letters[1:5],
z = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)
# Single bracket (returns data frame)
df[1:2, ] # First 2 rows
df[, 1:2] # First 2 columns
df[df$x > 2, ] # Rows where x > 2
# Double bracket (returns vector)
df[[1]] # First column as vector
# Dollar sign (returns vector)
df$x # x column as vector
# subset() function
subset(df, x > 2) # Rows where x > 2
subset(df, x > 2, select = c(x, y)) # Select columns
List Subsetting
lst <- list(
a = 1:3,
b = "text",
c = list(x = 10, y = 20)
)
# Single bracket (returns list)
lst[1] # list(a = 1:3)
lst["a"] # list(a = 1:3)
# Double bracket (returns element)
lst[[1]] # 1:3
lst[["a"]] # 1:3
# Dollar sign
lst$a # 1:3
# Nested lists
lst$c$x # 10
lst[[c(3, 1)]] # 10
Functions
Function Definition
# Basic function
square <- function(x) {
return(x^2)
}
# Implicit return (last expression)
square <- function(x) {
x^2
}
# Multiple arguments
power <- function(x, n) {
x^n
}
# Default arguments
power <- function(x, n = 2) {
x^n
}
Function Arguments
# Named arguments
power(x = 5, n = 3)
power(n = 3, x = 5) # Order doesn't matter
# Mixing named and positional arguments
power(x = 5, n = 3)
power(5, n = 3) # x matched by position
# ... (dot-dot-dot)
myprint <- function(...) {
args <- list(...)
print(args)
}
Anonymous Functions
# Traditional
sapply(1:5, function(x) x^2)
# R 4.1+ shorthand
sapply(1:5, \(x) x^2)
# With multiple arguments
mapply(\(x, y) x + y, 1:3, 4:6)
Control Flow
Conditionals
# if/else
if (x > 0) {
print("positive")
} else if (x < 0) {
print("negative")
} else {
print("zero")
}
# ifelse (vectorized)
ifelse(x > 0, "pos", "neg")
# switch
result <- switch(
type,
"a" = 1,
"b" = 2,
"c" = 3,
NA # default
)
Loops
# for loop
for (i in 1:10) {
print(i)
}
# for with vectors
fruits <- c("apple", "banana", "orange")
for (fruit in fruits) {
print(fruit)
}
# while loop
i <- 1
while (i <= 10) {
print(i)
i <- i + 1
}
# repeat loop
i <- 1
repeat {
print(i)
i <- i + 1
if (i > 10) break
}
# next (continue)
for (i in 1:10) {
if (i %% 2 == 0) next
print(i) # Only odd numbers
}
Apply Family
apply()
# Apply over matrix/array
m <- matrix(1:12, nrow = 3)
# Apply to rows (margin = 1)
apply(m, 1, sum) # Row sums
# Apply to columns (margin = 2)
apply(m, 2, mean) # Column means
# Custom function
apply(m, 1, function(x) max(x) - min(x))
lapply() & sapply()
# lapply returns list
nums <- list(a = 1:3, b = 4:6, c = 7:9)
lapply(nums, sum) # list(a = 6, b = 15, c = 24)
# sapply simplifies to vector/matrix
sapply(nums, sum) # c(a = 6, b = 15, c = 24)
# On vectors
sapply(1:5, function(x) x^2) # 1 4 9 16 25
# vapply (with explicit type)
vapply(nums, sum, numeric(1))
tapply()
# Apply by groups
scores <- c(85, 90, 75, 80, 95, 70)
groups <- c("A", "B", "A", "B", "A", "B")
tapply(scores, groups, mean)
# A B
# 85 80
# Multiple grouping factors
tapply(scores, list(groups, c(1,1,2,2,1,2)), mean)
Other Apply Functions
# mapply (multivariate)
mapply(function(x, y) x + y, 1:3, 4:6) # 5 7 9
# rapply (recursive for lists)
lst <- list(a = 1:3, b = list(c = 4:6))
rapply(lst, sum, how = "list")
dplyr Package
Basic Verbs
library(dplyr)
# filter() - subset rows
filter(df, age > 25)
filter(df, age > 25 & score < 90)
# select() - subset columns
select(df, name, age)
select(df, -score) # Exclude
select(df, starts_with("a"))
# mutate() - add/modify columns
mutate(df, age_double = age * 2)
mutate(df, age = age + 1) # Modify
# arrange() - sort rows
arrange(df, age) # Ascending
arrange(df, desc(age)) # Descending
# summarise() - aggregate
summarise(df, mean_age = mean(age))
Grouping & Piping
# group_by() + summarise()
df %>%
group_by(department) %>%
summarise(
mean_age = mean(age),
count = n()
)
# Pipe operator
df %>%
filter(age > 25) %>%
select(name, score) %>%
arrange(desc(score))
# Native pipe (R 4.1+)
df |>
filter(age > 25) |>
mutate(high_score = score > 90)
Other dplyr Functions
# count() - count unique values
count(df, department)
# distinct() - unique rows
distinct(df, department)
# slice() - select rows by position
slice(df, 1:5)
slice_head(df, n = 3)
slice_tail(df, n = 3)
slice_max(df, order_by = score, n = 5)
# rename() - rename columns
rename(df, full_name = name)
# relocate() - reorder columns
relocate(df, score, .before = age)
Data Import/Export
CSV Files
# Read CSV
df <- read.csv("data.csv")
df <- read.csv("data.csv", header = TRUE)
df <- read.csv("data.csv", stringsAsFactors = FALSE)
# Write CSV
write.csv(df, "output.csv")
write.csv(df, "output.csv", row.names = FALSE)
# readr package (faster, better defaults)
library(readr)
df <- read_csv("data.csv")
write_csv(df, "output.csv")
R Data Files
# Save single object
saveRDS(df, "data.rds")
df <- readRDS("data.rds")
# Save multiple objects
save(df1, df2, file = "data.RData")
load("data.RData") # Objects restored with original names
# Save workspace
save.image("workspace.RData")
load("workspace.RData")
Other Formats
# Excel (readxl package)
library(readxl)
df <- read_excel("data.xlsx")
df <- read_excel("data.xlsx", sheet = "Sheet1")
# JSON (jsonlite package)
library(jsonlite)
data <- fromJSON("data.json")
write_json(df, "output.json")
# Database (DBI package)
library(DBI)
con <- dbConnect(RSQLite::SQLite(), "database.db")
df <- dbReadTable(con, "table_name")
dbDisconnect(con)
Data Manipulation
Base R Functions
# subset()
subset(df, age > 25)
subset(df, age > 25 & score < 90)
subset(df, age > 25, select = c(name, score))
# merge() - join data frames
df1 <- data.frame(id = 1:3, name = c("A", "B", "C"))
df2 <- data.frame(id = 2:4, score = c(80, 90, 95))
merge(df1, df2, by = "id") # Inner join
merge(df1, df2, by = "id", all = TRUE) # Outer join
merge(df1, df2, by = "id", all.x = TRUE) # Left join
# aggregate()
aggregate(score ~ department, data = df, mean)
aggregate(cbind(age, score) ~ department, data = df, mean)
Data Reshaping
# reshape2 package
library(reshape2)
# Wide to long
df_long <- melt(df, id.vars = "id")
# Long to wide
df_wide <- dcast(df_long, id ~ variable)
# tidyr package
library(tidyr)
# pivot_longer (wide to long)
df_long <- pivot_longer(df, cols = -id)
# pivot_wider (long to wide)
df_wide <- pivot_wider(df_long, names_from = name, values_from = value)
Combining Data
# rbind() - combine rows (column names must match)
df3 <- rbind(df1, df2)
# cbind() - combine columns (row counts must match)
df3 <- cbind(df1, df2)
# bind_rows() (dplyr)
bind_rows(df1, df2)
# bind_cols() (dplyr)
bind_cols(df1, df2)
Statistics
Descriptive Statistics
# Central tendency
mean(x) # Mean
median(x) # Median
names(which.max(table(x))) # Mode (no built-in; mode(x) returns the storage type)
# Variability
sd(x) # Standard deviation
var(x) # Variance
IQR(x) # Interquartile range
range(x) # Min and max
diff(range(x)) # Range value
# Summary
summary(x) # Five-number summary + mean
quantile(x) # Quartiles
quantile(x, probs = c(0.25, 0.75))
Statistical Tests
# t-test
t.test(x) # One-sample
t.test(x, y) # Two-sample
t.test(x ~ group, data = df) # Formula notation
# Chi-square test
chisq.test(table(x, y))
# Correlation
cor(x, y) # Pearson
cor(x, y, method = "spearman")
cor.test(x, y) # With significance
# Linear regression
model <- lm(y ~ x, data = df)
summary(model)
predict(model, newdata)
Distributions
# Normal distribution
rnorm(100) # Random samples
dnorm(x) # Density
pnorm(x) # Cumulative probability
qnorm(p) # Quantile
# Other distributions
runif(100) # Uniform
rpois(100, lambda = 5) # Poisson
rbinom(100, size = 10, prob = 0.5) # Binomial
Plotting
Base Graphics
# Scatter plot
plot(x, y)
plot(x, y, main = "Title", xlab = "X", ylab = "Y")
plot(x, y, col = "blue", pch = 19)
# Line plot
plot(x, y, type = "l")
plot(x, y, type = "b") # Both points and lines
# Histogram
hist(x)
hist(x, breaks = 20, col = "lightblue")
# Boxplot
boxplot(x)
boxplot(score ~ department, data = df)
# Barplot
barplot(table(x))
barplot(counts, names.arg = labels)
ggplot2 Basics
library(ggplot2)
# Basic template
ggplot(df, aes(x = x, y = y)) +
geom_point()
# Scatter plot
ggplot(df, aes(x = age, y = score)) +
geom_point(aes(color = department))
# Line plot
ggplot(df, aes(x = time, y = value)) +
geom_line()
# Bar plot
ggplot(df, aes(x = category)) +
geom_bar()
# Histogram
ggplot(df, aes(x = value)) +
geom_histogram(bins = 30)
# Boxplot
ggplot(df, aes(x = group, y = value)) +
geom_boxplot()
ggplot2 Customization
# Faceting
ggplot(df, aes(x = x, y = y)) +
geom_point() +
facet_wrap(~ category)
# Themes
ggplot(df, aes(x = x, y = y)) +
geom_point() +
theme_minimal()
# Labels
ggplot(df, aes(x = x, y = y)) +
geom_point() +
labs(
title = "Title",
x = "X Label",
y = "Y Label"
)
# Scales
ggplot(df, aes(x = x, y = y, color = z)) +
geom_point() +
scale_color_gradient(low = "blue", high = "red")
String Manipulation
Basic String Operations
# Concatenate
paste("Hello", "World") # "Hello World"
paste0("Hello", "World") # "HelloWorld"
paste(c("a", "b"), collapse = ",") # "a,b"
# Substring
substr("Hello World", 1, 5) # "Hello"
substring("Hello World", 7) # "World"
# String length
nchar("Hello") # 5
# Case conversion
toupper("hello") # "HELLO"
tolower("HELLO") # "hello"
Pattern Matching
# grep() - find matches
grep("pattern", c("text", "pattern", "other")) # 2
grep("pattern", strings, value = TRUE) # Return values
# grepl() - logical vector
grepl("pattern", strings) # TRUE/FALSE vector
# sub() & gsub() - replace
sub("old", "new", "old text old") # Replace first
gsub("old", "new", "old text old") # Replace all
# Regular expressions
gsub("[0-9]", "", "abc123") # "abc"
gsub("\\s+", " ", "a   b   c") # "a b c"
stringr Package
library(stringr)
# Detect
str_detect(strings, "pattern")
# Extract
str_extract(strings, "pattern")
str_extract_all(strings, "pattern")
# Replace
str_replace(strings, "old", "new")
str_replace_all(strings, "old", "new")
# Split
str_split(strings, ",")
# Trim whitespace
str_trim(" hello ") # "hello"
Package Management
Installing Packages
# Install from CRAN
install.packages("dplyr")
install.packages(c("dplyr", "ggplot2"))
# Install from GitHub
install.packages("devtools")
devtools::install_github("user/repo")
# Install from Bioconductor
if (!requireNamespace("BiocManager"))
install.packages("BiocManager")
BiocManager::install("package")
Loading Packages
# library() - load package
library(dplyr)
# require() - load with check
if (require(dplyr)) {
# dplyr is available
}
# Namespace access (no loading)
dplyr::filter(df, age > 25)
# Unload package
detach("package:dplyr", unload = TRUE)
Managing Packages
# List installed packages
installed.packages()
# Update packages
update.packages()
# Remove package
remove.packages("dplyr")
# Check if installed
"dplyr" %in% rownames(installed.packages())
Operators
Arithmetic Operators
x + y # Addition
x - y # Subtraction
x * y # Multiplication
x / y # Division
x ^ y # Exponentiation
x %% y # Modulo
x %/% y # Integer division
Comparison Operators
x == y # Equal
x != y # Not equal
x > y # Greater than
x < y # Less than
x >= y # Greater or equal
x <= y # Less or equal
Logical Operators
x & y # Element-wise AND
x | y # Element-wise OR
!x # NOT
x && y # Scalar AND
x || y # Scalar OR
xor(x, y) # Exclusive OR
Special Operators
x %in% y # Match
x %*% y # Matrix multiplication
x %>% f() # Pipe (magrittr; re-exported by dplyr)
x |> f() # Native pipe (R 4.1+)
Working Directory & Files
Directory Operations
# Get working directory
getwd()
# Set working directory
setwd("/path/to/directory")
# List files
list.files()
list.files(pattern = "\\.csv$")
dir() # Alias for list.files()
# File info
file.exists("file.txt")
file.info("file.txt")
File Operations
# Create directory
dir.create("new_folder")
# Copy file
file.copy("from.txt", "to.txt")
# Rename/move file
file.rename("old.txt", "new.txt")
# Delete file
file.remove("file.txt")
# Path operations
file.path("folder", "file.txt") # OS-independent paths
basename("/path/to/file.txt") # "file.txt"
dirname("/path/to/file.txt") # "/path/to"
Gotchas
Common Pitfalls
Factors vs Characters
# Factors can surprise you
df <- read.csv("file.csv") # Strings become factors in R < 4.0
df$name[1] <- "New Name" # On a factor: NA with an "invalid factor level" warning
# Solution: stringsAsFactors = FALSE (the default since R 4.0)
df <- read.csv("file.csv", stringsAsFactors = FALSE)
Vector Recycling
# Shorter vectors are recycled
c(1, 2, 3) + c(1, 2) # Warning: length mismatch
# Result: 2 4 4 (1+1, 2+2, 3+1)
# Be explicit with rep()
c(1, 2, 3) + rep(c(1, 2), length.out = 3)
[ vs [[
# Single bracket returns same type
lst[1] # Returns list
df[1] # Returns data frame
# Double bracket extracts element
lst[[1]] # Returns element
df[[1]] # Returns vector
NA Comparisons
# NA comparisons return NA
x <- NA
x == NA # NA (not TRUE!)
# Use is.na()
is.na(x) # TRUE
Performance Tips
# Pre-allocate vectors
x <- numeric(1000) # Good
for (i in 1:1000) x[i] <- i
# Growing vectors is slow
x <- c() # Bad
for (i in 1:1000) x <- c(x, i)
# Use vectorized operations
sum(x) # Fast
result <- 0 # Slow
for (i in x) result <- result + i
# Prefer built-in vectorized functions over loops
rowSums(m) # Fast (apply(m, 1, sum) is still an R-level loop)
sums <- numeric(nrow(m)) # Slower
for (i in seq_len(nrow(m))) sums[i] <- sum(m[i,])
Also see
- R Project - Official R website
- CRAN - Comprehensive R Archive Network
- RStudio - Popular R IDE
- dplyr Documentation - Data manipulation grammar
- ggplot2 Documentation - Grammar of graphics
- R for Data Science - Free online book by Hadley Wickham
- GitHub Issue #923 - Original request