Content

Set working directory

# show where R is currently working
getwd()
# switch to the workshop folder
setwd("~/Desktop/rnaseq-workshop-2017")

Install packages

# install ggplot2 from the default repository
install.packages('ggplot2')
# two ways of loading an installed package
library('ggplot2')
require('ggplot2')
# uninstall the package again
remove.packages('ggplot2')
## restart R
# try to load the package after it has been removed;
# require() returns FALSE instead of raising an error
library(ggplot2)
require('ggplot2')
# install from a local source tarball (repos = NULL means "not from a repository")
install.packages('data/ggplot2_2.2.1.tar.gz', repos = NULL)
require('ggplot2')
# install DESeq2 through the Bioconductor installer script
# NOTE(review): biocLite.R is the legacy Bioconductor installer; newer
# Bioconductor releases install through BiocManager -- confirm for your R version
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2")
require('DESeq2')
packageVersion('DESeq2')
remove.packages('DESeq2')
# restart R
require('DESeq2')

# devtools is loaded here before install_github() is called
install.packages('devtools')
require('devtools')
# install DESeq2 from a GitHub repository
install_github('Bioconductor-mirror/DESeq2')
# same repository, but pinned to a specific ref
install_github('Bioconductor-mirror/DESeq2', ref = 'release-3.3')
packageVersion('DESeq2')
## read this page if this doesn't work for you:
## https://stackoverflow.com/questions/29992066/rcpp-warning-directory-not-found-for-option-l-usr-local-cellar-gfortran-4-8/29993906#29993906
# install several packages in one call from an explicitly chosen repository
pkgs = c('plyr', 'dplyr')
install.packages(pkgs, repos = "http://cran.us.r-project.org")

Get some help

Data and data structures

Built-in datasets

# list the datasets that ship with a package
data(package = "base")
data(package = "ggplot2")
# load the mpg dataset from ggplot2, then print it
data("mpg", package = "ggplot2")
mpg

loading data

Two most common data loading functions: read.csv() and read.table().

# open the help page for read.csv
?read.csv
# comma-separated file, read with the default settings
# (path fixed: the file lives in 'data/', as in every other call in this document)
read.csv('data/PBSC-comma.txt')
# tab-separated file: pass the separator explicitly
read.csv('data/PBSC-tab.txt', sep = '\t')
# treat the first line as data instead of column names
read.csv('data/PBSC-comma.txt', header=FALSE)
read.table('data/PBSC-tab.txt', header=TRUE) # does the header argument work?
# turn off comment handling
# NOTE(review): presumably the header line starts with '#' (cf. the X.Base column) -- confirm against the file
read.table('data/PBSC-tab.txt', header=TRUE, comment.char = '')

# example use case: load a GTF file
# (tab-separated, no header row, lines starting with '#' are skipped)
read.table(
  "Arabidopsis_thaliana.h20.gtf",
  header = FALSE,
  sep = "\t",
  comment.char = "#"
)

variables

  • Restrictions on variable names.
    • Technically, there are almost no restrictions on variable names.
    • But most R programmers:
      • start names with lower case letters
      • separate words in names with underscores
      • only use lower case letters, numbers and underscores
x <- 1:10
x
my_data <- 1:10
my_data
my_data_2 <- 1:5
my_data_2

# assign() binds a value to a name that is given as a string
assign(x = "my_data", value = 3:8)
my_data

# names like these are possible, but you should probably never use them
assign(x = "2017", value = 1:10)
"2017" # just a string, not the variable
get("2017") # get() looks the variable up by its name

assign(x = "#", value = "this is my value")
get("#")
  • Assign the imported data to a variable.
# keep the imported table in a variable, then print it
pbsc <- read.csv("data/PBSC-comma.txt", header = TRUE)
pbsc

data structures

  • Data frame
    • the most common data structure in R
    • has two dimensions
# check that the imported object is a data frame
is.data.frame(pbsc)

# compact overview of the structure: one line per column
str(pbsc)
  • Vectors
    • very basic data structure in R
    • two flavors: atomic vectors and lists.
x <- 1:10
x
is.vector(x)

# a single column can be pulled out by name or by position
pbsc$G
pbsc[, 1]
# a column taken out of a data frame is an atomic vector
is.vector(pbsc$G)
is.data.frame(pbsc$G)

# pull out a single row
pbsc[1, ]
# unlike a column, a single row is still a data frame
is.vector(pbsc[1, ])
is.data.frame(pbsc[1, ])

pbsc[1, ]$G
  • atomic vectors vs. lists
    • atomic vectors are homogeneous. They can only have one type of values.
    • lists can be homogeneous or heterogeneous: their elements may have different types.
my_vector <- c(1, 3, 5) # c() combines values into a vector
my_vector

my_v2 <- c("a", "b", "c")
my_v2

my_v3 <- c(1, 2, "b") # mixed input: the numbers are converted to strings
my_v3

my_list <- list(a = 1:3, b = c("a", "b", "d"))
my_list
my_list$a

# a list element can hold any type of data, even a data frame
my_l2 <- list(a = 1:3, b = c("a", "b", "d"), df = pbsc)
my_l2
my_l2$df
my_l2[1]
  • matrix
    • two-dimensional
    • homogeneous
my_matrix <- matrix(1:12, nrow = 3, ncol = 4)
my_matrix

my_matrix_2 <- matrix(letters[1:12], nrow = 3, ncol = 4)
my_matrix_2

# as.matrix() turns the heterogeneous data frame into a homogeneous matrix
my_m3 <- as.matrix(pbsc)
my_m3

# a single column of a matrix is a vector
my_m3[, 3]
is.vector(my_m3[, 3])

# a single row of a matrix is a vector as well
my_m3[1, ]
is.vector(my_m3[1, ])

Subsetting operators

There are three commonly used subsetting operators: [[, $, and [

  • $ is a shorthand for [[. It is useful when values are bound to character names.
  • $ is invalid for atomic vectors.
  • [ is similar to [[. The difference is that [[ can only return a single value, while [ can return multiple values.
# print the data frame used in the subsetting examples
pbsc
  • A data frame is also a list.
    • the columns in the data frame are elements in the list.
# the same object is tested both as a data frame and as a list
is.data.frame(pbsc)
is.list(pbsc)

subsetting atomic vectors

my_v1 <- letters[1:10]
my_v1

# one element at a time: both [ and [[ work
my_v1[1]
my_v1[[2]]

# several elements at once: use [
my_v1[1:5]
my_v1[c(1, 5, 9)]
my_v1[c(1, 3, 3, 3, 1:10)] # an index may repeat elements

my_v1[[1:5]] # this won't work

# once the elements have names, they can be looked up by name
names(my_v1) <- LETTERS[1:10]
my_v1
my_v1["A"]
my_v1[c("A", "C", "B")]

my_v1$A # this won't work!
# A use case for looking elements up by name.

my_sequence <- sample(c("g", "c", "t", "a"), size = 20, replace = TRUE)
my_sequence

# how do we turn lower case bases into upper case bases?
# build a lookup vector: the names are the inputs, the values the outputs
my_template <- c(g = "G", c = "C", t = "T", a = "A")
my_template

my_template[my_sequence] # yeah!!
# this trick becomes very handy with the ggplot2 package for data visualization.

subsetting data frame

# access a single cell of the data frame (row 1, column 2).
pbsc[1,2]
pbsc[[1,2]] # this works because we are accessing a single value.

# access multiple values
pbsc[1:5, ] # rows from 1 to 5
pbsc[, 1:2] # columns from 1 to 2
pbsc[1:5, 1:2] # subsetting by rows and columns

pbsc[[1:5, ]] # this won't work

# get one column from the data frame and keep the data frame structure.
pbsc[, 3]
is.vector(pbsc[, 3])

# drop=FALSE keeps the result a one-column data frame
pbsc[, 3, drop=FALSE]
is.vector(pbsc[, 3, drop=FALSE])
is.data.frame(pbsc[, 3, drop=FALSE])

# elements in the index don't have to be unique.
pbsc[, c(1:4, 4, 4)]
nrow(pbsc)
# NOTE(review): 55 is presumably nrow(pbsc) -- confirm against the data file
pbsc[c(1:2, 2, 2, 3:55), ]

# use $ to get a column by name
pbsc$A
pbsc$X.Base

subsetting list

my_l2 <- list(a = 1:3, b = c("a", "b", "d"), df = pbsc)
my_l2

# $ extracts one element by name
my_l2$a
my_l2$df
# [ with an index vector selects several elements, repeats allowed
my_l2[1:3]
my_l2[c(1, 1, 1, 1, 2, 2, 3)]

my_l2[[1:3]] # this won't work!

When [ is applied to a list, it always returns a list. To access elements from an element within a list, we need the [[ operator.

# [ on a list returns a list
my_l2[1]
is.list(my_l2[1])

# how to get the second value from element 'a' in list 'my_l2'
my_l2[1][2] # this won't work!! my_l2[1] is a one-element list, so [2] indexes the list, not element 'a'
# chaining [1] just keeps selecting the first element of a one-element list
my_l2[1][1][1][1][1][1][1][1][1][1][1][1][1][1][1][1]

# [[ extracts the element itself
my_l2[[1]]
is.list(my_l2[[1]])
is.vector(my_l2[[1]])

my_l2[[1]][2] # this will return the second value from element 'a' in list 'my_l2'
# the third element is the data frame, so $ can then pick one of its columns
my_l2[[3]]
my_l2[[3]]$G

subsetting matrix

pbsc_matrix <- as.matrix(pbsc)
pbsc_matrix

# select columns
pbsc_matrix[, 2]
pbsc_matrix[, 2:3]

# select rows
pbsc_matrix[1:2, ]

# the '$' operator cannot be used on a matrix.
pbsc_matrix$G # this won't work!!

# select a single row or column and keep the matrix structure.
pbsc_matrix[1, ]
pbsc_matrix[1, , drop = FALSE]

pbsc_matrix[, 2]
pbsc_matrix[, 2, drop = FALSE]

Factors

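When you import data from a file, R treats string columns as factors by default in versions before 4.0.0. Since R 4.0.0 the default is stringsAsFactors = FALSE, so factors must be requested explicitly.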

# Request factors explicitly: since R 4.0.0 read.csv() keeps string columns as
# character vectors unless stringsAsFactors = TRUE is passed. On older R this
# is the default, so the result there is unchanged.
pbsc <- read.csv('data/PBSC-comma.txt', header = TRUE, stringsAsFactors = TRUE)
pbsc$X.Base

# the stringsAsFactors argument: FALSE keeps string columns as character vectors
pbsc_str <- read.csv('data/PBSC-comma.txt', header = TRUE, stringsAsFactors = FALSE)
pbsc_str$X.Base

Factors are used to store categorical data and are built on top of integer vectors.

You need two components to define a factor:

  • A set of predefined categories (or levels)
  • A vector that contains values only from those categories.
# categories (levels): c('a', 'b', 'c')
# values:              c('b', 'c', 'b', 'a', 'b')
factor_x1 <- factor(
  x = c("b", "c", "b", "a", "b"),
  levels = c("a", "b", "c")
)
factor_x1

# the values do not need to cover every category
factor_x2 <- factor(
  x = c("b", "b", "b", "b", "b"),
  levels = c("a", "b", "c")
)
factor_x2

If you don’t provide the category set, it will use the unique values from the vector as the category set

# no levels given: they are derived from the values
factor_x3 <- factor(c("a", "c", "e", "e", "f", "a"))
factor_x3

How to understand that factors are built on top of integer vectors?

# a factor converts to the integer codes of its levels
factor_x1
as.integer(factor_x1)

factor_x3
as.integer(factor_x3)

# plain string vectors behave differently:
# strings that are not numbers become NA (with a warning)
str_v1 <- c("a", "c", "e", "e", "f", "a")
as.integer(str_v1)

# strings that look like numbers are parsed as those numbers
str_v2 <- c("1", "3", "4", "1")
as.integer(str_v2)

How are factor elements mapped to integers?

  • The factor elements are mapped to an integer sequence that starts from 1, and the mapping order is determined by the order of the factor levels.

# same values, levels in the order a, b, c
factor_x1 <- factor(
  x = c("b", "c", "b", "a", "b"),
  levels = c("a", "b", "c")
)
factor_x1
as.integer(factor_x1)

# same values, levels in the reverse order c, b, a
factor_x4 <- factor(
  x = c("b", "c", "b", "a", "b"),
  levels = c("c", "b", "a")
)
factor_x4
# the integer codes follow the level order, so they differ between the two
as.integer(factor_x4)
as.integer(factor_x1)

Different level orders can lead to different data analysis results, because the reference level changes. Next time you do a linear regression analysis, try resetting the level order of your categorical variables and see what happens.

Write data into files

  • write.table()
  • write.csv()
  • sink()
mtcars

# space-separated text file with column and row names
write.table(x = mtcars, file = 'mtcars.txt', col.names = TRUE, row.names = TRUE)
# write.csv() does not let you change col.names
write.csv(x = mtcars, file = 'mtcars.csv', col.names = TRUE) # generates a warning
# let's open the help document for write.csv
?write.csv

write.csv(x = mtcars, file = 'mtcars.csv')

# recommend using write.table
# of course you can save the data into a csv file;
# write.table() separates fields with a space by default, so sep = ',' is
# needed for the file to really be comma-separated
write.table(x = mtcars, file = 'mtcars.csv', sep = ',', col.names = TRUE, row.names = TRUE)
write.table(x = mtcars, file = 'mtcars_2.csv', sep = ',', col.names = FALSE, row.names = FALSE)

sink() diverts R output to a connection.

# start diverting output: everything printed from here on goes to this file
sink("my_linear_regression_analysis.txt")

# output that would normally show up on screen is written to the file instead.
lm(mpg ~ ., data = mtcars)
summary(lm(mpg ~ ., data = mtcars))

# stop diverting: output goes back to the screen
sink()
Copyright © 2017 Ming Chen. All rights reserved.