This tutorial is a teaching material that I created for a workshop(https://github.com/MingChen0919/KBS_workshop_Michigan_2016/blob/master/fun_with_R.md).

Target file: sam file

SRR2426363.615  163 gi|254160123|ref|NC_012967.1|   3356120 10  71M1D8M1I125M23S    =   3356375 504 CTTGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTCTGCGGGTAACGTCAATTGCTGCGGTTATTAGCCACAACACCTTCCTCCCCGCTGAAAGTACTTTACAACCCGAAGGCCTTCTTCATACACGCGGCATGGCTGCATCAGGCTTGCGCCCATTGTGCAATATTCCCCACTGCTGCCTCCCGTCGTAGTCAGTACTGTGTCTGAGT    AAABBFF43AAAEGFGGGGGFGGFCGGFHHHGHHGGGG?GHHHHC?FC?G5GHGGGGHGF/EEHH?GGGCFHHHHGFGGGG?GBFGEF3?C?<GE?BGGHHHH/E?CCD/DGFDGHHGHBGHGFD.<A@C?.C<<0DGFFFHFFGEGF?D-AFGGGEEGGFF9FB?BBEA?D?.//9/9BFBFFFFFF/;BAFEFBBF.;9.9.A.;-99AB/;/9;/:99BBB//:9    XT:A:M  NM:i:13 SM:i:10 AM:i:10 XM:i:11 XO:i:2  XG:i:2  MD:Z:71^G0A2A0A0A7A1T0T0T1C0T0C111
SRR2426363.698  89  gi|254160123|ref|NC_012967.1|   4015323 0   168M1I33M   =   4015323 0   TGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTACCTTAAAGAAGCGTACTTTGCAGTGCTCACACAGATTGTCTGATGAAAAACGAGCAGTAAAACCTCTACAGGCTTGTAG  EA;0C00GFC:00C:/DGGCFD0D0DDC?GGCC.<<--/@@??1?11?1GCGDCDHFDFB@@1B<?@FB12DCFAEF>F11<>1DHE>>AEHFFGF1EF0EEE/>/B1B1HG1E>EFBFFFGFGFHHDGBEFAA/2GFGAEGDF1GG1GFB2ABGF1HGFCB1F2GGF0F0GEEAEDFG1DF1B1FABF3FFFAAC1?AAAA  XT:A:R  NM:i:2  SM:i:0  AM:i:0  X0:i:3  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:172T28 XA:Z:gi|254160123|ref|NC_012967.1|,-228028,168M1I33M,2;gi|254160123|ref|NC_012967.1|,+3355048,28M1I173M,2;
SRR2426363.1034 163 gi|254160123|ref|NC_012967.1|   3912080 29  39M1I137M   =   3912459 618 CTCTTTAGGTATTCCTTCGAACAAGATGCAAGAATAGACAAAAATGACAGCCCTTCTACGAGTGATTAGCCTGGTCGTGATTAGCGTGGTGGTGATTATTATCCCACCGTGCGGGGCTGCACTTGGACGAGGAAAGGCTTAGAAATCAAGCCTTAACGAACTAAGACCCCCGCACCG   ABBBBFFBFFFFGGGGGGGCGGGHGHFFFFCHHGHHHHFHFFAFGHFFFGEAEHHHHHGFDEEHHHHFHHHGFGGHFEGHHHHGF0B????EEFHHGGGHHFGHHHHGGHHEE?EC?DGFBGFHHEFFCGG//CGC0CGEB11=1>GG1>GGBGFB0DC.-.F0;CFHEGCCGCA@9   XT:A:U  NM:i:4  SM:i:29 AM:i:29 X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:9C24A107G33
CIGAR

CIGAR

Goals

Positions of all insertion/deletion

                Qname start_pos insert_1 insert_2 insert_3 insert_4
1      SRR2426363.615   3356120  3356201       NA       NA       NA
2      SRR2426363.698   4015323  4015492       NA       NA       NA
3     SRR2426363.1034   3912080  3912120       NA       NA       NA
4     SRR2426363.4217   4382662  4382826  4382838       NA       NA
5     SRR2426363.4732    979022   979071       NA       NA       NA
6     SRR2426363.6348   4017239  4017395       NA       NA       NA

Data manipulation skills with plyr/dplyr

#---  **ply(): a*ply, l*ply, d*ply
#---  select(): focus on a subset of variables
#---  filter(): focus on a subset of rows
#---  mutate(): add new columns
#---  arrange(): re-order the rows
#---  colwise(): functional programming example
#---  [[: subsetting
#---  ->: right assignment operator
#---  ->>: assign through parents environments
#---  %>%
#---  workflow

Let’s do it!

Load packages

p = c('plyr', 'dplyr', 'stringr')
install.packages(p)

library(plyr)
library(dplyr)
library(stringr)
rm(list=ls())

Get data

Option 1: read data remotely

myData = readLines('https://raw.githubusercontent.com/MingChen0919/KBS_workshop_Michigan_2016/master/SRR2426363.mapped.sam')

Option 2: download data and then read data locally

curl -O https://raw.githubusercontent.com/MingChen0919/KBS_workshop_Michigan_2016/master/SRR2426363.mapped.sam
myData = readLines('SRR2426363.mapped.sam')

Do everything in one single pipe line.

  • Pipeline starts with your data
  • Input -> Output: No intermediate variables generated
  • Pipeline is extensible
myData %>% tail(-1) %>% ## remove the first row because it contains field names
  (function(x){
    ## wrap the str_split_fixed() function so it returns a character
    str_split_fixed_2 = function(string, pattern, n){
      x = str_split_fixed(string, pattern, n)
      return(as.character(x))
    }
    ldply(x, str_split_fixed_2, '\t', 20)[, 1:11]
  }) %>% 
  (function(x){
    colnames(x) = c('QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ',
                    'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL')
    rownames(x) = paste0('seq', 1:nrow(x))
    return(x)
  }) %>% 
  mutate(FLAG = as.numeric(FLAG),
         POS = as.numeric(POS),
         MAPQ   = as.numeric(MAPQ),
         PNEXT  = as.numeric(PNEXT),
         TLEN   = as.numeric(TLEN)) %>%
  (function(x){
    filter(x, MAPQ > 30 & MAPQ < 55) %>%
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% nrow() 
    return(x)
  }) %>% 
  (function(x){
    arrange(x, MAPQ) %>% 
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% head() 
    return(x)
  }) %>%
  (function(x){
    filter(x, MAPQ > 30) %>% 
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>%  
      arrange(POS, MAPQ) %>% head() 
    return(x)
  }) %>% 
  (function(x){
    select(x, CIGAR) %>% 
      laply(grep, pattern='[I]') %>% 
      slice(.data = x) %>%
      (function(x){
        x ->> df_with_Insertion
        select(x, CIGAR) ->> CIGAR_with_Insertion
        select(x, QNAME) ->> QNAME 
        select(x, POS) ->> POS
        return(select(x, CIGAR))
      })
  }) %>% 
  llply(str_split, 'I') %>%
  `[[`('CIGAR') %>%
  llply(head, -1) %>%
  llply(gsub, pattern='[A-Z]', replacement='+') %>%
  (function(x){
    eval_string = function(var1) eval(parse(text=var1))  ## define a function
    eval_string_plus = function(var2) laply(var2, eval_string) ## define another function
    llply(x, eval_string_plus)
  })  %>% 
  llply(cumsum) %>%
  (function(x){
    maxN = max(lengths(x))
    newX = vector('numeric', length = maxN)
    ldply(x, `[`, 1:maxN)
  }) %>% 
  (function(x){
    `colnames<-`(x, paste0('insert_', 1:ncol(x)))
    ## return(x)
  }) %>%
  colwise(.fun=`+`)(unlist(POS)) %>%
  mutate(Qname = unlist(QNAME), start_pos = unlist(POS)) %>%
  `[`(c('Qname', 'start_pos', 'insert_1', 'insert_2', 'insert_3', 'insert_4'))

Let’s break it down!
##==== Step 1: split each line by tab and return a tabular data structure  ====
myData %>% tail(-1) %>% ## remove the first row because it contains field names
  (function(x){
    ## wrap the str_split_fixed() function so it returns a character
    str_split_fixed_2 = function(string, pattern, n){
      x = str_split_fixed(string, pattern, n)
      return(as.character(x))
    }
    ldply(x, str_split_fixed_2, '\t', 20)[, 1:11]
  }) %>% 
##==== Step 2:  set column names =======
  (function(x){
    colnames(x) = c('QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ',
                    'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL')
    rownames(x) = paste0('seq', 1:nrow(x))
    return(x)
  }) %>% 
##==== Step:3 convert strings to numbers ====
  mutate(FLAG = as.numeric(FLAG),
         POS = as.numeric(POS),
         MAPQ   = as.numeric(MAPQ),
         PNEXT  = as.numeric(PNEXT),
         TLEN   = as.numeric(TLEN)) %>%
##==== Exploring  ====
  (function(x){
    filter(x, MAPQ > 30 & MAPQ < 55) %>%
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% nrow() 
    return(x)
  }) %>% 
##==== Exploring ====
  (function(x){
    arrange(x, MAPQ) %>% 
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% head() 
    return(x)
  }) %>%
##==== Exploring ====
  (function(x){
    filter(x, MAPQ > 30) %>% 
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>%  
      arrange(POS, MAPQ) %>% head() 
    return(x)
  }) %>% 
##==== Step 4: select rows that have insertions ====
  (function(x){
    select(x, CIGAR) %>% 
      laply(grep, pattern='[I]') %>% 
      slice(.data = x) %>%
      (function(x){
        x ->> df_with_Insertion
        select(x, CIGAR) ->> CIGAR_with_Insertion
        select(x, QNAME) ->> QNAME 
        select(x, POS) ->> POS
        return(select(x, CIGAR))
      })
  }) %>%
##==== Step 5: split CIGAR by insertion letter 'I'  ====
  llply(str_split, 'I') %>%
##==== Step 6: Extract data from a nested list ====
  `[[`('CIGAR') %>%
##==== Step 7: delete the last element ====
  llply(head, -1) %>%
##==== Step 8: build calculation expression by replace remaining CIGAR letters with '+' ====
  llply(gsub, pattern='[A-Z]', replacement='+') %>%
##==== Step 9: evalate the expression ====
 (function(x){
    eval_string = function(var1) eval(parse(text=var1))  ## define a function
    eval_string_plus = function(var2) laply(var2, eval_string) ## define another function
    llply(x, eval_string_plus)
  })  %>% 
##==== Step 10: get positions ====
 llply(cumsum) %>%
  (function(x){
    maxN = max(lengths(x))
    newX = vector('numeric', length = maxN)
    ldply(x, `[`, 1:maxN)
  }) %>% 
##==== Step 10: column names and row names ====
  (function(x){
    `colnames<-`(x, paste0('insert_', 1:ncol(x)))
    ## return(x)
  }) %>%
##==== Step 11: add starting position =======
colwise(.fun=`+`)(unlist(POS)) %>%
##==== Step 11: restructure the data frame to get final results ====
  mutate(Qname = unlist(QNAME), start_pos = unlist(POS)) %>%
  `[`(c('Qname', 'start_pos', 'insert_1', 'insert_2', 'insert_3', 'insert_4'))
Copyright © 2017 Ming Chen. All rights reserved.