# Show the current working directory, then switch to the workshop folder.
getwd()
setwd("~/Desktop/rnaseq-workshop-2017")

# Install ggplot2 from CRAN and attach it (library() and require() both
# attach a package; they differ in how they report a missing one).
install.packages("ggplot2")
library("ggplot2")
require("ggplot2")

# Uninstall the package again.
remove.packages("ggplot2")
## restart R
# After the removal these two calls are expected to fail to attach ggplot2.
library(ggplot2)
require("ggplot2")

# Reinstall from a local source tarball; repos = NULL tells
# install.packages() that the first argument is a file path.
install.packages("data/ggplot2_2.2.1.tar.gz", repos = NULL)
require("ggplot2")
# Install DESeq2 through the Bioconductor installer script.
# NOTE(review): biocLite() is the installer this 2017 workshop relies on;
# newer Bioconductor releases use BiocManager::install() instead -- confirm
# which one matches the R version in use.
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2")
require("DESeq2")
packageVersion("DESeq2")

# Uninstall DESeq2 so the GitHub-based installation can be tried next.
remove.packages("DESeq2")
# restart R
require("DESeq2")
# devtools provides install_github() for installing straight from GitHub.
install.packages("devtools")
require("devtools")

# Install the default branch first, then a specific one selected with `ref`.
install_github("Bioconductor-mirror/DESeq2")
install_github("Bioconductor-mirror/DESeq2", ref = "release-3.3")
packageVersion("DESeq2")
## read this page if this doesn't work for you:
## https://stackoverflow.com/questions/29992066/rcpp-warning-directory-not-found-for-option-l-usr-local-cellar-gfortran-4-8/29993906#29993906
# From a terminal (not the R console), run the install script with: Rscript install-r-packages.R
# Install several CRAN packages in a single call from an explicit mirror.
pkgs <- c("plyr", "dplyr")
install.packages(pkgs, repos = "http://cran.us.r-project.org")
# Getting help: the `?` operator and the help() function.
# help() function:
help(package='DESeq2')
# List the data sets shipped with a package.
data(package = "base")
data(package = "ggplot2")
# Load one data set from ggplot2 into the workspace and print it.
data("mpg", package = "ggplot2")
mpg
# The two most common data-loading functions are read.csv() and read.table().
# Open the help page for read.csv().
?read.csv
# Comma-separated file read with the read.csv() defaults.
# (Path fixed: the file lives in data/, as in every other call in this
# script, not in data/data/.)
read.csv("data/PBSC-comma.txt")
# read.csv() can also read a tab-delimited file when sep is overridden.
read.csv("data/PBSC-tab.txt", sep = "\t")
# header = FALSE treats the first line as data instead of column names.
read.csv("data/PBSC-comma.txt", header = FALSE)
# read.table() defaults to comment.char = "#", so a header line that starts
# with "#" would be skipped as a comment (presumably the case for this file --
# verify against the data).
read.table("data/PBSC-tab.txt", header = TRUE) # does the header argument work?
# Turning comment handling off lets the first line be read as the header.
read.table("data/PBSC-tab.txt", header = TRUE, comment.char = "")
# an example use case: load gtf file
read.table("Arabidopsis_thaliana.h20.gtf", header = FALSE, sep = "\t",
           comment.char = "#")
# Create a few variables and print them.
x <- 1:10
x
my_data <- 1:10
my_data
my_data_2 <- 1:5
my_data_2

# assign() binds a value to a name given as a string; here it overwrites
# the existing my_data.
assign(x = "my_data", value = 3:8)
my_data

# some variable names that you should probably never use.
assign(x = "2017", value = 1:10)
"2017"       # just the string, not the variable
get("2017")  # looks the variable up by name
assign(x = "#", value = "this is my value")
get("#")
# Load the comma-separated PBSC table; the first line supplies column names.
pbsc <- read.csv("data/PBSC-comma.txt", header = TRUE)
pbsc
is.data.frame(pbsc)
str(pbsc)

# For comparison: a plain integer vector.
x <- 1:10
x
is.vector(x)

# Extract one column, by name and by position.
pbsc$G
pbsc[, 1]
# A column of a data frame is an atomic vector ...
is.vector(pbsc$G)
is.data.frame(pbsc$G)

# Extract one row.
pbsc[1, ]
# ... whereas a row of a data frame is still a data frame.
is.vector(pbsc[1, ])
is.data.frame(pbsc[1, ])
pbsc[1, ]$G
# c() combines values into a vector.
my_vector <- c(1, 3, 5)
my_vector
my_v2 <- c("a", "b", "c")
my_v2
# Mixing numbers and strings converts the numbers to strings automatically.
my_v3 <- c(1, 2, "b")
my_v3

# A list holds named elements that may differ in type.
my_list <- list(a = 1:3, b = c("a", "b", "d"))
my_list
my_list$a
# Any type of data can be an element of a list -- even a data frame.
my_l2 <- list(a = 1:3, b = c("a", "b", "d"), df = pbsc)
my_l2
my_l2$df
my_l2[1]
# Build 3 x 4 matrices from an integer sequence and from letters.
my_matrix <- matrix(1:12, nrow = 3, ncol = 4)
my_matrix
my_matrix_2 <- matrix(letters[1:12], nrow = 3, ncol = 4)
my_matrix_2
# Convert a heterogeneous data frame into a homogeneous matrix.
my_m3 <- as.matrix(pbsc)
my_m3
# A column taken from a matrix is a vector.
my_m3[, 3]
is.vector(my_m3[, 3])
# A row taken from a matrix is a vector as well.
my_m3[1, ]
is.vector(my_m3[1, ])
# There are three commonly used subsetting operators: `[[`, `$`, and `[`.
# `$` is a shorthand for `[[`. It is useful when values are bound to character
# names. `$` is invalid for atomic vectors.
# `[` is similar to `[[`. The difference is that `[[` can only return a single
# value, while `[` can return multiple values.
pbsc
# A data frame is also a list.
is.data.frame(pbsc)
is.list(pbsc)

# A character vector to practise subsetting on.
my_v1 <- letters[1:10]
my_v1
# Single value: both `[` and `[[` work.
my_v1[1]
my_v1[[2]]
# Multiple values: only `[` works.
my_v1[1:5]
my_v1[c(1, 5, 9)]
my_v1[c(1, 3, 3, 3, 1:10)] # get some elements repeatedly
my_v1[[1:5]] # this won't work
# Access elements by name.
names(my_v1) <- LETTERS[1:10]
my_v1
my_v1["A"]
my_v1[c("A", "C", "B")]
my_v1$A # this won't work!
# Use case for name-based access: a lookup table.
my_sequence <- sample(c("g", "c", "t", "a"), 20, replace = TRUE)
my_sequence
# How do we convert lower-case bases to upper-case bases?
# Name each upper-case base by its lower-case counterpart ...
my_template <- c(g = "G", c = "C", t = "T", a = "A")
my_template
# ... then index the table with the sequence itself.
my_template[my_sequence] # yeah!!
# you will see how useful this is when you use the ggplot2 package for data visualization.
# Access a single cell of the data frame.
pbsc[1, 2]
pbsc[[1, 2]] # this will work since we are accessing single value.
# Access multiple values.
pbsc[1:5, ]    # rows from 1 to 5
pbsc[, 1:2]    # columns from 1 to 2
pbsc[1:5, 1:2] # subsetting by rows and columns
pbsc[[1:5, ]] # this won't work
# Take one column and keep the data frame structure with drop = FALSE.
pbsc[, 3]
is.vector(pbsc[, 3])
pbsc[, 3, drop = FALSE]
is.vector(pbsc[, 3, drop = FALSE])
is.data.frame(pbsc[, 3, drop = FALSE])
# Index values do not have to be unique.
pbsc[, c(1:4, 4, 4)]
nrow(pbsc)
pbsc[c(1:2, 2, 2, 3:55), ]
# Access columns with `$`.
pbsc$A
pbsc$X.Base
# A list with an integer vector, a character vector and a data frame.
my_l2 <- list(a = 1:3, b = c("a", "b", "d"), df = pbsc)
my_l2
# Access elements with `$`.
my_l2$a
my_l2$df
# Use an index vector with `[` to access multiple elements.
my_l2[1:3]
my_l2[c(1, 1, 1, 1, 2, 2, 3)]
my_l2[[1:3]] # this won't work!
# When `[` is applied to a list, it always returns a list. To access the
# contents of an element within a list, we need the `[[` operator.
# `[` on a list returns a (shorter) list.
my_l2[1]
is.list(my_l2[1])
# How do we get the second value of element 'a' in list 'my_l2'?
my_l2[1][2] # this won't work!!
# Repeating `[1]` keeps returning the same one-element list.
my_l2[1][1][1][1][1][1][1][1][1][1][1][1][1][1][1][1]
# `[[` extracts the element itself.
my_l2[[1]]
is.list(my_l2[[1]])
is.vector(my_l2[[1]])
my_l2[[1]][2] # this will return the second value from element 'a' in list 'my_l2'
# The third element is the data frame, so `$` can be chained onto it.
my_l2[[3]]
my_l2[[3]]$G
# Matrix version of the data frame.
pbsc_matrix <- as.matrix(pbsc)
pbsc_matrix
# Access columns.
pbsc_matrix[, 2]
pbsc_matrix[, 2:3]
# Access rows.
pbsc_matrix[1:2, ]
# The `$` operator does not apply to a matrix.
pbsc_matrix$G # this won't work!!
# Take a single row or column and keep the matrix structure with drop = FALSE.
pbsc_matrix[1, ]
pbsc_matrix[1, , drop = FALSE]
pbsc_matrix[, 2]
pbsc_matrix[, 2, drop = FALSE]
# When you import data from a file, R versions before 4.0.0 treat string columns as factors by default (since R 4.0.0 the default is stringsAsFactors = FALSE).
# Read the table with the default handling of string columns.
pbsc <- read.csv("data/PBSC-comma.txt", header = TRUE)
pbsc$X.Base
# The stringsAsFactors argument: FALSE keeps strings as character vectors.
pbsc_str <- read.csv(
  "data/PBSC-comma.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)
pbsc_str$X.Base
# Factors are used to store categorical data and are built on top of integer vectors.
# You need two components to define a factor:
# 1. a set of categories (the levels): c('a', 'b', 'c')
# 2. a vector of values:               c('b', 'c', 'b', 'a', 'b')
factor_x1 <- factor(c("b", "c", "b", "a", "b"), levels = c("a", "b", "c"))
factor_x1
# The vector does not have to contain every category.
factor_x2 <- factor(c("b", "b", "b", "b", "b"), levels = c("a", "b", "c"))
factor_x2
# If you don't provide the category set, the unique values of the vector are used as the category set.
# No levels supplied: they are derived from the values themselves.
factor_x3 <- factor(c("a", "c", "e", "e", "f", "a"))
factor_x3
# How can we see that factors are built on top of integer vectors?
# Converting a factor to integer reveals its underlying codes.
factor_x1
as.integer(factor_x1)
factor_x3
as.integer(factor_x3)
# Compare with plain string vectors: as.integer() parses the text instead.
str_v1 <- c("a", "c", "e", "e", "f", "a")
as.integer(str_v1)
str_v2 <- c("1", "3", "4", "1")
as.integer(str_v2)
# How are factor elements mapped to integers?
# * The factor elements are mapped to an integer sequence that starts from 1, and the mapping order is determined by the order of the factor levels.
# Same values, levels in the order a, b, c.
factor_x1 <- factor(c("b", "c", "b", "a", "b"), levels = c("a", "b", "c"))
factor_x1
as.integer(factor_x1)
# Same values, levels reversed: the integer codes change accordingly.
factor_x4 <- factor(c("b", "c", "b", "a", "b"), levels = c("c", "b", "a"))
factor_x4
as.integer(factor_x4)
as.integer(factor_x1)
# Different level orders can lead to different data analysis results because the reference level changes. The next time you do a linear regression analysis, try resetting the level order of your categorical variables and see what happens.
# Functions for writing output: write.table(), write.csv(), and sink().
# Print the built-in mtcars data set that is written out below.
mtcars
# write.table() defaults to sep = " ", so this produces a space-delimited file.
write.table(x = mtcars, file = "mtcars.txt", col.names = TRUE, row.names = TRUE)
# write.csv() does not allow col.names to be changed; it warns and ignores it.
write.csv(x = mtcars, file = "mtcars.csv", col.names = TRUE) # generates a message
# let's open the help document for write.csv
?write.csv
write.csv(x = mtcars, file = "mtcars.csv")
# recommend using write.table
# of course you can save the data into a csv file: set sep = "," so the
# content really is comma-separated, and use col.names = NA (together with
# row.names = TRUE) so the header gets an empty field above the row names.
write.table(x = mtcars, file = "mtcars.csv", sep = ",", col.names = NA,
            row.names = TRUE)
# Data only: no header line and no row names.
write.table(x = mtcars, file = "mtcars_2.csv", sep = ",", col.names = FALSE,
            row.names = FALSE)
# sink() diverts R output to a connection.
# Start diverting output by giving sink() a file path.
sink(file = "my_linear_regression_analysis.txt")
# Everything that would normally be printed on screen now goes into the file.
lm(mpg ~ ., data = mtcars)
summary(lm(mpg ~ ., data = mtcars))
# Calling sink() with no arguments stops the diversion.
sink()