This tutorial is a teaching material that I created for a workshop(https://github.com/MingChen0919/KBS_workshop_Michigan_2016/blob/master/fun_with_R.md).
SRR2426363.615 163 gi|254160123|ref|NC_012967.1| 3356120 10 71M1D8M1I125M23S = 3356375 504 CTTGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTCTGCGGGTAACGTCAATTGCTGCGGTTATTAGCCACAACACCTTCCTCCCCGCTGAAAGTACTTTACAACCCGAAGGCCTTCTTCATACACGCGGCATGGCTGCATCAGGCTTGCGCCCATTGTGCAATATTCCCCACTGCTGCCTCCCGTCGTAGTCAGTACTGTGTCTGAGT AAABBFF43AAAEGFGGGGGFGGFCGGFHHHGHHGGGG?GHHHHC?FC?G5GHGGGGHGF/EEHH?GGGCFHHHHGFGGGG?GBFGEF3?C?<GE?BGGHHHH/E?CCD/DGFDGHHGHBGHGFD.<A@C?.C<<0DGFFFHFFGEGF?D-AFGGGEEGGFF9FB?BBEA?D?.//9/9BFBFFFFFF/;BAFEFBBF.;9.9.A.;-99AB/;/9;/:99BBB//:9 XT:A:M NM:i:13 SM:i:10 AM:i:10 XM:i:11 XO:i:2 XG:i:2 MD:Z:71^G0A2A0A0A7A1T0T0T1C0T0C111
SRR2426363.698 89 gi|254160123|ref|NC_012967.1| 4015323 0 168M1I33M = 4015323 0 TGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTACCTTAAAGAAGCGTACTTTGCAGTGCTCACACAGATTGTCTGATGAAAAACGAGCAGTAAAACCTCTACAGGCTTGTAG EA;0C00GFC:00C:/DGGCFD0D0DDC?GGCC.<<--/@@??1?11?1GCGDCDHFDFB@@1B<?@FB12DCFAEF>F11<>1DHE>>AEHFFGF1EF0EEE/>/B1B1HG1E>EFBFFFGFGFHHDGBEFAA/2GFGAEGDF1GG1GFB2ABGF1HGFCB1F2GGF0F0GEEAEDFG1DF1B1FABF3FFFAAC1?AAAA XT:A:R NM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:172T28 XA:Z:gi|254160123|ref|NC_012967.1|,-228028,168M1I33M,2;gi|254160123|ref|NC_012967.1|,+3355048,28M1I173M,2;
SRR2426363.1034 163 gi|254160123|ref|NC_012967.1| 3912080 29 39M1I137M = 3912459 618 CTCTTTAGGTATTCCTTCGAACAAGATGCAAGAATAGACAAAAATGACAGCCCTTCTACGAGTGATTAGCCTGGTCGTGATTAGCGTGGTGGTGATTATTATCCCACCGTGCGGGGCTGCACTTGGACGAGGAAAGGCTTAGAAATCAAGCCTTAACGAACTAAGACCCCCGCACCG ABBBBFFBFFFFGGGGGGGCGGGHGHFFFFCHHGHHHHFHFFAFGHFFFGEAEHHHHHGFDEEHHHHFHHHGFGGHFEGHHHHGF0B????EEFHHGGGHHFGHHHHGGHHEE?EC?DGFBGFHHEFFCGG//CGC0CGEB11=1>GG1>GGBGFB0DC.-.F0;CFHEGCCGCA@9 XT:A:U NM:i:4 SM:i:29 AM:i:29 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:9C24A107G33
Qname start_pos insert_1 insert_2 insert_3 insert_4
1 SRR2426363.615 3356120 3356201 NA NA NA
2 SRR2426363.698 4015323 4015492 NA NA NA
3 SRR2426363.1034 3912080 3912120 NA NA NA
4 SRR2426363.4217 4382662 4382826 4382838 NA NA
5 SRR2426363.4732 979022 979071 NA NA NA
6 SRR2426363.6348 4017239 4017395 NA NA NA
#--- **ply(): a*ply, l*ply, d*ply
#--- select(): focus on a subset of variables
#--- filter(): focus on a subset of rows
#--- mutate(): add new columns
#--- arrange(): re-order the rows
#--- colwise(): functional programming example
#--- [[: subsetting
#--- ->: right assignment operator
#--- ->>: assign through parents environments
#--- %>%
#--- workflow
p = c('plyr', 'dplyr', 'stringr')
install.packages(p)
library(plyr)
library(dplyr)
library(stringr)
rm(list=ls())
myData = readLines('https://raw.githubusercontent.com/MingChen0919/KBS_workshop_Michigan_2016/master/SRR2426363.mapped.sam')
curl -O https://raw.githubusercontent.com/MingChen0919/KBS_workshop_Michigan_2016/master/SRR2426363.mapped.sam
myData = readLines('SRR2426363.mapped.sam')
myData %>% tail(-1) %>% ## remove the first row because it contains field names
(function(x){
## wrap the str_split_fixed() function so it returns a character
str_split_fixed_2 = function(string, pattern, n){
x = str_split_fixed(string, pattern, n)
return(as.character(x))
}
ldply(x, str_split_fixed_2, '\t', 20)[, 1:11]
}) %>%
(function(x){
colnames(x) = c('QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ',
'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL')
rownames(x) = paste0('seq', 1:nrow(x))
return(x)
}) %>%
mutate(FLAG = as.numeric(FLAG),
POS = as.numeric(POS),
MAPQ = as.numeric(MAPQ),
PNEXT = as.numeric(PNEXT),
TLEN = as.numeric(TLEN)) %>%
(function(x){
filter(x, MAPQ > 30 & MAPQ < 55) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% nrow()
return(x)
}) %>%
(function(x){
arrange(x, MAPQ) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% head()
return(x)
}) %>%
(function(x){
filter(x, MAPQ > 30) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>%
arrange(POS, MAPQ) %>% head()
return(x)
}) %>%
(function(x){
select(x, CIGAR) %>%
laply(grep, pattern='[I]') %>%
slice(.data = x) %>%
(function(x){
x ->> df_with_Insertion
select(x, CIGAR) ->> CIGAR_with_Insertion
select(x, QNAME) ->> QNAME
select(x, POS) ->> POS
return(select(x, CIGAR))
})
}) %>%
llply(str_split, 'I') %>%
`[[`('CIGAR') %>%
llply(head, -1) %>%
llply(gsub, pattern='[A-Z]', replacement='+') %>%
(function(x){
eval_string = function(var1) eval(parse(text=var1)) ## define a function
eval_string_plus = function(var2) laply(var2, eval_string) ## define another function
llply(x, eval_string_plus)
}) %>%
llply(cumsum) %>%
(function(x){
maxN = max(lengths(x))
newX = vector('numeric', length = maxN)
ldply(x, `[`, 1:maxN)
}) %>%
(function(x){
`colnames<-`(x, paste0('insert_', 1:ncol(x)))
## return(x)
}) %>%
colwise(.fun=`+`)(unlist(POS)) %>%
mutate(Qname = unlist(QNAME), start_pos = unlist(POS)) %>%
`[`(c('Qname', 'start_pos', 'insert_1', 'insert_2', 'insert_3', 'insert_4'))
##==== Step 1: split each line by tab and return a tabular data structure ====
myData %>% tail(-1) %>% ## remove the first row because it contains field names
(function(x){
## wrap the str_split_fixed() function so it returns a character
str_split_fixed_2 = function(string, pattern, n){
x = str_split_fixed(string, pattern, n)
return(as.character(x))
}
ldply(x, str_split_fixed_2, '\t', 20)[, 1:11]
}) %>%
##==== Step 2: set column names =======
(function(x){
colnames(x) = c('QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ',
'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL')
rownames(x) = paste0('seq', 1:nrow(x))
return(x)
}) %>%
##==== Step:3 convert strings to numbers ====
mutate(FLAG = as.numeric(FLAG),
POS = as.numeric(POS),
MAPQ = as.numeric(MAPQ),
PNEXT = as.numeric(PNEXT),
TLEN = as.numeric(TLEN)) %>%
##==== Exploring ====
(function(x){
filter(x, MAPQ > 30 & MAPQ < 55) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% nrow()
return(x)
}) %>%
##==== Exploring ====
(function(x){
arrange(x, MAPQ) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% head()
return(x)
}) %>%
##==== Exploring ====
(function(x){
filter(x, MAPQ > 30) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>%
arrange(POS, MAPQ) %>% head()
return(x)
}) %>%
##==== Step 4: select rows that have insertions ====
(function(x){
select(x, CIGAR) %>%
laply(grep, pattern='[I]') %>%
slice(.data = x) %>%
(function(x){
x ->> df_with_Insertion
select(x, CIGAR) ->> CIGAR_with_Insertion
select(x, QNAME) ->> QNAME
select(x, POS) ->> POS
return(select(x, CIGAR))
})
}) %>%
##==== Step 5: split CIGAR by insertion letter 'I' ====
llply(str_split, 'I') %>%
##==== Step 6: Extract data from a nested list ====
`[[`('CIGAR') %>%
##==== Step 7: delete the last element ====
llply(head, -1) %>%
##==== Step 8: build calculation expression by replace remaining CIGAR letters with '+' ====
llply(gsub, pattern='[A-Z]', replacement='+') %>%
##==== Step 9: evalate the expression ====
(function(x){
eval_string = function(var1) eval(parse(text=var1)) ## define a function
eval_string_plus = function(var2) laply(var2, eval_string) ## define another function
llply(x, eval_string_plus)
}) %>%
##==== Step 10: get positions ====
llply(cumsum) %>%
(function(x){
maxN = max(lengths(x))
newX = vector('numeric', length = maxN)
ldply(x, `[`, 1:maxN)
}) %>%
##==== Step 10: column names and row names ====
(function(x){
`colnames<-`(x, paste0('insert_', 1:ncol(x)))
## return(x)
}) %>%
##==== Step 11: add starting position =======
colwise(.fun=`+`)(unlist(POS)) %>%
##==== Step 11: restructure the data frame to get final results ====
mutate(Qname = unlist(QNAME), start_pos = unlist(POS)) %>%
`[`(c('Qname', 'start_pos', 'insert_1', 'insert_2', 'insert_3', 'insert_4'))