Automatic Update

988cd29d · Julien Cornut · 279a5992 · 988cd29d
Commit 988cd29d authored 9 years ago by Julien Cornut
--- a/ProcessFASTQ.ipynb
+++ b/ProcessFASTQ.ipynb
@@ -95,7 +95,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 191,
+   "execution_count": 214,
   "metadata": {
    "collapsed": false
   },
@@ -114,7 +114,7 @@
      "flowcell362_lane4_pair1_ACTTGA \t\t Processsed\n",
      "flowcell362_lane4_pair1_CAGATC \t\t Processsed\n",
      "flowcell362_lane4_pair1_TGACCA \t\t Processsed\n",
-      "flowcell362_lane4_pair1_Undetermined\n",
+      "flowcell362_lane4_pair1_Undetermined \t\t Processsed\n",
      "flowcell384_lane7_pair1_ACAGTG \t\t Processsed\n",
      "flowcell384_lane7_pair1_ACTTGA \t\t Processsed\n",
      "flowcell384_lane7_pair1_CAGATC \t\t Processsed\n",
@@ -726,7 +726,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 209,
   "metadata": {
    "collapsed": false
   },
@@ -760,11 +760,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 210,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Time loading forward index: 00:00:00\n",
+      "Time loading mirror index: 00:00:00\n",
+      "End-to-end 2/3-mismatch full-index search: 00:00:30\n",
+      "# reads processed: 2599991\n",
+      "# reads with at least one reported alignment: 439705 (16.91%)\n",
+      "# reads that failed to align: 2160286 (83.09%)\n",
+      "Reported 439705 alignments to 1 output stream(s)\n",
+      "Time searching: 00:00:30\n",
+      "Overall time: 00:00:30\n"
+     ]
+    }
+   ],
   "source": [
    "%%bash\n",
    "source ./source\n",
@@ -789,7 +805,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 211,
   "metadata": {
    "collapsed": true
   },
@@ -820,7 +836,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 212,
   "metadata": {
    "collapsed": false
   },
@@ -864,7 +880,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 213,
   "metadata": {
    "collapsed": false
   },

 %% Cell type:markdown id: tags:
 # Notebook refact of Process.py
 %% Cell type:markdown id: tags:
 <p style="background-color:#B0E0E6; color:#000000;padding:5px">
 This Notebook is used to process fastq files, turning them into exploitable wig files.
 <br><br>
 Only Python3 is available here.
 <br><br>
 An automatic back up it set up every 10 minutes, using crontab and git.
 <br><br>
 A "source" file is written to pass environnement variable to each bash magic cells.
 <br><br>
 If you want to make any changes, please comment the corresponding line, or convert the cell in RAW format using the menu.
 <br><br>
 You can also duplicate the Notebook if you need heavy changes.
 <br><br>
 Please don't touch the 0-Raw/ directory files.
 </p>
 %% Cell type:markdown id: tags:
 <p style="background-color:#FFAAAA; color:#000000;padding:5px">
 Copy the content of this template cell to make your comments more obvious and clear in the notebook.
 </p>
 %% Cell type:code id: tags:
 ``` python
 %%bash
 # Binaries are installed with Anaconda's package manager (Continuum Analytics)
 # Print version of binaries
 echo
 echo -e "Version of Cutadapt \t : $(cutadapt --version)"
 echo -e "Version of Bowtie \t : $(bowtie --version | head -n 1 | tail -c 6)"
 ```
 %% Output
    Version of Cutadapt 	 : 1.9.1
    Version of Bowtie 	 : 1.1.2
 %% Cell type:markdown id: tags:
 ## User manual
 * Use ctrl+enter to run the focused cell
 * Double click a cell to change it's content
 * Use cell->run all to run all cells
 * Use the "+" button to add a cell
 * If you need to comment a cell, use drop menu to convert it to Raw NB Convert cell
 * Use the scissor button to delete a cell
 * Don't forget to save your work
 * Use this notebook by simply cut/paste the filename of the file you need to process into the fname variable in the cell below
 * Availables files are listed under the first Python cell
 * Depending on the size, the notebook will take a long time to process it
 * Use the magic "%%bash" and newline to start a bash interpreted cell
 * If you get an "notebook has changed" message, use the RELOAD button, I'm probably working on it !
 %% Cell type:markdown id: tags:
 Import necessary libraries
 %% Cell type:code id: tags:
 ``` python
 # This lib is needed to parse fastq easily
 from Bio import SeqIO
 # This lib is used to avoid using a bash magic cell to copy a file
 from shutil import copyfile
 # Used to... list directory
 from os import listdir, path
 # Check the time taken by a function to run
 import datetime
 # Enter/Uncomment here the name of the file you want to process
 # fname = "Undetermined_lane7_pair1"
 # fname = "flowcell261_lane8_pair1_CAGATC"
 # fname = "flowcell261_lane8_pair1_TGACCA"
 # fname = "flowcell362_lane4_pair1_ACAGTG"
 # fname = "flowcell362_lane4_pair1_ACTTGA"
 # fname = "flowcell362_lane4_pair1_CAGATC"
 # fname = "flowcell362_lane4_pair1_TGACCA"
 fname = "flowcell362_lane4_pair1_Undetermined"
 # fname = "flowcell384_lane7_pair1_ACAGTG"
 # fname = "flowcell384_lane7_pair1_ACTTGA"
 # fname = "flowcell384_lane7_pair1_CAGATC"
 # fname = "flowcell384_lane7_pair1_GATCAG"
 # fname = "flowcell384_lane7_pair1_TGACCA"
 # Print available files in 0-Raws/ directory
 print("\nAvailable files :\n")
 fn = {n.split('.')[0] for n in listdir("0-Raws/")} # Basename
 wn = {n               for n in listdir("7-WIGs/")} # P and M files
 lst = (n+" \t\t Processsed"                 \
       if (n+".P.wig" and n+".M.wig") in wn \
       else n                               \
       for n in sorted(fn))
 for l in lst:
    print(l)
 ```
 %% Output
    Available files :
    Undetermined_lane7_pair1
    flowcell261_lane8_pair1_CAGATC 		 Processsed
    flowcell261_lane8_pair1_TGACCA 		 Processsed
    flowcell362_lane4_pair1_ACAGTG 		 Processsed
    flowcell362_lane4_pair1_ACTTGA 		 Processsed
    flowcell362_lane4_pair1_CAGATC 		 Processsed
    flowcell362_lane4_pair1_TGACCA 		 Processsed
-    flowcell362_lane4_pair1_Undetermined
+    flowcell362_lane4_pair1_Undetermined 		 Processsed
    flowcell384_lane7_pair1_ACAGTG 		 Processsed
    flowcell384_lane7_pair1_ACTTGA 		 Processsed
    flowcell384_lane7_pair1_CAGATC 		 Processsed
    flowcell384_lane7_pair1_GATCAG 		 Processsed
    flowcell384_lane7_pair1_TGACCA 		 Processsed
    testing
 %% Cell type:markdown id: tags:
 ## Build working tree
 Necessary step if you run this notebook on a local laptop or for the first time. If so, please convert cell in code.
 %% Cell type:raw id: tags:
 %%bash
 mkdir -p 0-Raws
 mkdir -p 1-Cutadapted
 mkdir -p 2-Gunzipped
 mkdir -p 3-Filtered
 mkdir -p 4-Bowtied
 mkdir -p 5-ncRNA-Removed
 mkdir -p 6-Genome-Aligned
 mkdir -p 7-WIGs
 %% Cell type:markdown id: tags:
 ## Set up environnement
 * Create a source file to share the FILENAME bash environnement variable (bash cells don't keep scope)
 * Copy the file we're working on to root of Bioinf dir to work safer
 %% Cell type:code id: tags:
 ``` python
 # Build export string used by bash in the "source" file
 export_line = "export FILENAME="+fname
 # Write a local file ("source") to share env. variable between cells
 with open("source","w") as f:
    f.write(export_line)
 ```
 %% Cell type:markdown id: tags:
 ## Copy to root dir
 %% Cell type:code id: tags:
 ``` python
 # Copy file from 0-Raws to root dir of notebook to work safer
 copyfile("0-Raws/"+fname+".fastq.gz", fname+".fastq.gz")
 ```
 %% Output
    'flowcell362_lane4_pair1_Undetermined.fastq.gz'
 %% Cell type:markdown id: tags:
 ## Check
 * File exists
 * Is not empty
 * Env. var. is set up
 * Print first entry of the file
 %% Cell type:code id: tags:
 ``` python
 %%bash
 source ./source
 echo
 echo $FILENAME
 echo
 zmore $FILENAME.fastq.gz | head -n 4
 ```
 %% Output
    flowcell362_lane4_pair1_Undetermined
    @SN279:493:C84L3ACXX:4:2309:1848:2226 1:N:0:TTCCGT
    NCTGTTACTGAGAAGTTAATGGATGAATTGGCACAATGCTACAATGTGCTC
    +
    #4=DDDDDHHHHHIHFHJJJJJJJJIJJJHIJJJJJJJJJJJJJJJIJJJJ
 %% Cell type:markdown id: tags:
 ## Estimate Cutadapt processing time
 %% Cell type:code id: tags:
 ``` python
 # Get file size in bytes
 fsize = path.getsize(fname+".fastq.gz")
 # Takes more or less 1 hour to process 2.45Gib.
 bytes_per_second = (2.45*pow(10,9))/3600
 # How many seconds for this file
 seconds = fsize/bytes_per_second
 # Estimated time
 h,r = divmod(seconds,3600)
 m,s = divmod(r,60)
 print("Processing will take approximatively {0:.0f} hour(s) {1:.0f} minute(s) and {2:.0f} second(s)"\
      .format(h,m,s))
 ```
 %% Output
    Processing will take approximatively 0 hour(s) 3 minute(s) and 31 second(s)
 %% Cell type:markdown id: tags:
 <br>
 <div style="text-align:center;font-size:300%">Cutadapt</div>
 <div style="text-align:center">Call cutadapt binary file to strip strands</div>
 %% Cell type:markdown id: tags:
 | Option    | Effect                 |
 |    -:     |   :-                   |
 |-a ADAPTER | Enter adapter sequence |
 |-o OUTPUT  | Indicate output file   |
 |--quiet    | No long report         |
 |INPUT      | Enter input file       |
 %% Cell type:code id: tags:
 ``` python
 # Store current time
 before = datetime.datetime.now()
 ```
 %% Cell type:code id: tags:
 ``` python
 %%bash
 source ./source
 # Export adapter we want to cut
 export ADAPTER=CTGTAGGCACCATCAATAGATCGGAA
 # Run binary
 cutadapt                               \
    -a $ADAPTER                        \
    -o 1-Cutadapted/$FILENAME.fastq.gz \
    --quiet                            \
    $FILENAME.fastq.gz
 ```
 %% Output
    This is cutadapt 1.9.1 with Python 3.5.1
    Command line parameters: -a CTGTAGGCACCATCAATAGATCGGAA -o 1-Cutadapted/flowcell362_lane4_pair1_Undetermined.fastq.gz --quiet flowcell362_lane4_pair1_Undetermined.fastq.gz
    Trimming 1 adapter with at most 10.0% errors in single-end mode ...
 %% Cell type:code id: tags:
 ``` python
 # Store current time
 after = datetime.datetime.now()
 # Difference
 delta = after-before
 print("Cutadapt run time : {0}".format(delta))
 ```
 %% Output
    Cutadapt run time : 0:03:24.043250
 %% Cell type:code id: tags:
 ``` python
 %%bash
 source ./source
 rm $FILENAME.fastq.gz
 ```
 %% Cell type:markdown id: tags:
 ## Unzip resulting compressed fastq
 %% Cell type:code id: tags:
 ``` python
 # Store current time
 before = datetime.datetime.now()
 ```
 %% Cell type:code id: tags:
 ``` python
 %%bash
 source ./source
 # Unzip compressed file and output it in mid-process directory
 zcat 1-Cutadapted/$FILENAME.fastq.gz > 2-Gunzipped/$FILENAME.fastq
 ```
 %% Cell type:code id: tags:
 ``` python
 # Store current time
 after = datetime.datetime.now()
 # Difference
 delta = after-before
 print("Zcat run time : {0}".format(delta))
 ```
 %% Output
    Zcat run time : 0:00:03.973413
 %% Cell type:markdown id: tags:
 ## Remove sequences with less than needed nucleotides
 Bowtie drops them anyway...
 This step takes an eternity to filter for big files...
 %% Cell type:code id: tags:
 ``` python
 # Store current time
 before = datetime.datetime.now()
 ```
 %% Cell type:code id: tags:
 ``` python
 # Enter the minimum needed sequence length
 min_seq_length = 4
 with open("2-Gunzipped/"+fname+".fastq", "r") as handle, \
     open("3-Filtered/" +fname+".fastq","w")  as output:
    # Iterate through each records in fastq file
    seq_iter = SeqIO.parse(handle,"fastq")
    # Generator over sequences's length greater than min_seq_length
    filt_iter = (rec for rec in seq_iter \
                 if (len(rec.seq) > min_seq_length))
    # Write back filtered fastq
    SeqIO.write(filt_iter,output,"fastq")
 ```
 %% Cell type:code id: tags:
 ``` python
 # Store current time
 after = datetime.datetime.now()
 # Difference
 delta = after-before
 print("Filtering run time : {0}".format(delta))
 ```
 %% Output
    Filtering run time : 0:02:37.166601
 %% Cell type:markdown id: tags:
 <br>
 <div style="text-align:center;font-size:300%">Bowtie</div>
 <br>
 %% Cell type:markdown id: tags:
 | Option    | Effect                 |
 |    -:     |   :-                   |
 | -S        | Output as a SAM file   |
 | -v 3      | report end-to-end hits w/ <=v mismatches; ignore qualities |
 | -p 4      | Use 4 processors       |
 | --best    | hits guaranteed best stratum; ties broken by quality |
 | INPUT     | Input file             |
 | OUTPUT    | Output file            |
 %% Cell type:code id: tags:
 ``` python
 %%bash
 source ./source
 bowtie                                                   \
    -S                                                   \
    -v 3                                                 \
    -p 8                                                 \
    --time                                               \
    --best                                               \
    ref/2-Indexes/Yeast-Noncoding/Yeast-Noncoding        \
    3-Filtered/$FILENAME.fastq                           \
    4-Bowtied/$FILENAME.sam
 ```
 %% Output
    Time loading forward index: 00:00:00
    Time loading mirror index: 00:00:00
    End-to-end 2/3-mismatch full-index search: 00:00:34
    # reads processed: 3112673
    # reads with at least one reported alignment: 512682 (16.47%)
    # reads that failed to align: 2599991 (83.53%)
    Reported 512682 alignments to 1 output stream(s)
    Time searching: 00:00:34
    Overall time: 00:00:34
 %% Cell type:markdown id: tags:
 ## Remove non-codant RNA
 Original script burns 416 lines (0-415). Doing so strip the first non-header entry. Is it right ? Here I strip 415 exactly.
 This block creates a generator that filters records according to the corresponding sam file with '4' in the field used in the original script. This method takes the bet that the first non-header line in the sam file matches the first line of the fastq. This is ugly. Don't do this. Need refact with browsing the sam file correctly.
 The '4' in the flag field of the SAM file means that the read has no reported alignment. In this case, every aligned read means a match with non-codant tRNA index. So we keep only "mismatches" as they represent all reads that don't match with non-codant, in other words the reads we are interested in.
 > Sum of all applicable flags. Flags relevant to Bowtie are:
 > * 1 The read is one of a pair
 > * 2 The alignment is one end of a proper paired-end alignment
 > * 4 The read has no reported alignments
 > * 8 The read is one of a pair and has no reported alignments
 > * 16 The alignment is to the reverse reference strand
 > * 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
 > * 64 The read is the first (#1) mate in a pair
 > * 128 The read is the second (#2) mate in a pair
 > Thus, an unpaired read that aligns to the reverse reference strand will have flag 16.
 > A paired-end read that aligns and is the first mate in the pair will have flag 83 (= 64 + 16 + 2 + 1).
 (From Bowtie manual, http://bowtie-bio.sourceforge.net/manual.shtml#sam-bowtie-output)
 %% Cell type:code id: tags:
 ``` python
 # Store current time
 before = datetime.datetime.now()
 ```
 %% Cell type:code id: tags:
 ``` python
 with open("3-Filtered/"     +fname+".fastq","r") as filtered, \
     open("4-Bowtied/"      +fname+".sam","r")   as matches,  \
     open("5-ncRNA-Removed/"+fname+".fastq","w") as substracted:
    # Iterator over fastq file
    filt_iter = SeqIO.parse(filtered,"fastq")
    # Strip header (as in original script)
    for _ in range(415-3): matches.readline()
    # Check last lines
    print(matches.readline())
    print(matches.readline())
    print(matches.readline())
    # Generator over fastq where the corresponding sam field is 4,
    # meaning no reported alignment
    sub_iter = (rec for rec in filt_iter \
                if matches.readline().split('\t')[1] == '4')
    # Write back fastq
    SeqIO.write(sub_iter,substracted,"fastq")
 ```
 %% Output
    @SQ	SN:tW(UCA)Q	LN:74
    @SQ	SN:tY(GUA)Q	LN:84
    @PG	ID:Bowtie	VN:1.1.2	CL:"bowtie --wrapper basic-0 -S -v 3 -p 8 --time --best ref/2-Indexes/Yeast-Noncoding/Yeast-Noncoding 3-Filtered/flowcell362_lane4_pair1_Undetermined.fastq 4-Bowtied/flowcell362_lane4_pair1_Undetermined.sam"
 %% Cell type:code id: tags:
 ``` python
 # Store current time
 after = datetime.datetime.now()
 # Difference
 delta = after-before
 print("Filtering non-codant tRNA run time : {0}".format(delta))
 ```
 %% Output
    Filtering non-codant tRNA run time : 0:02:31.047781
 %% Cell type:markdown id: tags:
 # Map Genome
 Invoke bowtie to map entire Cerevisiae genome. Same options as first run.
 %% Cell type:code id: tags:
 ``` python
 %%bash
 source ./source
 bowtie                                                   \
    -S                                                   \
    -v 2                                                 \
    -p 8                                                 \
    --time                                               \
    --best                                               \
    ref/2-Indexes/SCerevisiae/s_cerevisiae               \
    5-ncRNA-Removed/$FILENAME.fastq                      \
    6-Genome-Aligned/$FILENAME.sam
 ```
+%% Output
+    Time loading forward index: 00:00:00
+    Time loading mirror index: 00:00:00
+    End-to-end 2/3-mismatch full-index search: 00:00:30
+    # reads processed: 2599991
+    # reads with at least one reported alignment: 439705 (16.91%)
+    # reads that failed to align: 2160286 (83.09%)
+    Reported 439705 alignments to 1 output stream(s)
+    Time searching: 00:00:30
+    Overall time: 00:00:30
 %% Cell type:markdown id: tags:
 ## Define Lengths Dictionnary
 %% Cell type:code id: tags:
 ``` python
 # Those are Chromosome lengths according to original script
 LengthDict = {
    'chrI'   :230218,  'chrII'  :813184, 'chrIII':316620,  \
    'chrIV'  :1531933, 'chrV'   :576874, 'chrVI':270161,   \
    'chrVII' :1090940, 'chrVIII':562643, 'chrIX':439888,   \
    'chrX'   :745751,  'chrXI'  :666816, 'chrXII':1078177, \
    'chrXIII':924431,  'chrXIV' :784333, 'chrXV':1091291,  \
    'chrXVI' :948066,  'chrM'   :85779                     \
 }
 ```
 %% Cell type:markdown id: tags:
 ##  Write WIGs
 * This an ugly copy/paste of original script.
 * Script build huge lists... Bêêêh.
 * Definitely need refact with generators.
 %% Cell type:code id: tags:
 ``` python
 PlusDict   = {}
 MinusDict  = {}
 Count = 0
 for IdxC in LengthDict:
    PlusDict[IdxC]  = [0]*LengthDict[IdxC]
    MinusDict[IdxC] = [0]*LengthDict[IdxC]
 with open("6-Genome-Aligned/"+fname+".sam","r") as file:
    for line in file:
        if line[0] != '@':
            Split = line.split('\t')
            if Split[1] == '16':
                MinusDict[Split[2]][int(Split[3])] += 1
                Count += 1
            elif Split[1] == '0':
                PlusDict[Split[2]][int(Split[3])] += 1
                Count += 1
            else:
                pass
 ```
 %% Cell type:markdown id: tags:
 # Next step makes the WIGs
 * This an ugly copy/paste of original script.
 * Same problem
 %% Cell type:code id: tags:
 ``` python
 with open("7-WIGs/"+fname+".P.wig","w") as P, \
     open("7-WIGs/"+fname+".M.wig","w") as M:
    P.write('track name=tracklabel totalCounts=' \
            + str(Count)                         \
            + ' viewLimits=-5:5 color=79,159,36\n')
    M.write('track name=tracklabel totalCounts=' \
            + str(Count)                         \
            + ' viewLimits=-5:5 color=79,159,36\n')
    for IdxC in PlusDict:
        P.write('fixedStep  chrom=' + IdxC + '  start=1  step=1\n')
        M.write('fixedStep  chrom=' + IdxC + '  start=1  step=1\n')
        for IdxN in range(0, len(PlusDict[IdxC])):
            P.write(str(PlusDict[IdxC][IdxN])  + '\n')
            M.write(str(MinusDict[IdxC][IdxN]) + '\n')
 ```
 %% Cell type:markdown id: tags:
 # WIGs files are now available under 7-WIGs/ directory