diff --git a/ProcessFASTQ.ipynb b/ProcessFASTQ.ipynb index c6a0225a76f30ee9d279ad3b48aebb2c6daf972d..3c9e707f9094c2c4b3fe003030e1e4ba5a82d6b5 100644 --- a/ProcessFASTQ.ipynb +++ b/ProcessFASTQ.ipynb @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 190, "metadata": { "collapsed": false }, @@ -95,11 +95,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 191, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Available files :\n", + "\n", + "Undetermined_lane7_pair1\n", + "flowcell261_lane8_pair1_CAGATC \t\t Processsed\n", + "flowcell261_lane8_pair1_TGACCA \t\t Processsed\n", + "flowcell362_lane4_pair1_ACAGTG \t\t Processsed\n", + "flowcell362_lane4_pair1_ACTTGA \t\t Processsed\n", + "flowcell362_lane4_pair1_CAGATC \t\t Processsed\n", + "flowcell362_lane4_pair1_TGACCA \t\t Processsed\n", + "flowcell362_lane4_pair1_Undetermined\n", + "flowcell384_lane7_pair1_ACAGTG \t\t Processsed\n", + "flowcell384_lane7_pair1_ACTTGA \t\t Processsed\n", + "flowcell384_lane7_pair1_CAGATC \t\t Processsed\n", + "flowcell384_lane7_pair1_GATCAG \t\t Processsed\n", + "flowcell384_lane7_pair1_TGACCA \t\t Processsed\n", + "testing\n" + ] + } + ], "source": [ "# This lib is needed to parse fastq easily\n", "from Bio import SeqIO\n", @@ -122,12 +146,12 @@ "# fname = \"flowcell362_lane4_pair1_ACTTGA\"\n", "# fname = \"flowcell362_lane4_pair1_CAGATC\"\n", "# fname = \"flowcell362_lane4_pair1_TGACCA\"\n", - "# fname = \"flowcell362_lane4_pair1_Undetermined\"\n", + "fname = \"flowcell362_lane4_pair1_Undetermined\"\n", "# fname = \"flowcell384_lane7_pair1_ACAGTG\"\n", "# fname = \"flowcell384_lane7_pair1_ACTTGA\"\n", "# fname = \"flowcell384_lane7_pair1_CAGATC\"\n", "# fname = \"flowcell384_lane7_pair1_GATCAG\"\n", - "fname = \"flowcell384_lane7_pair1_TGACCA\"\n", + "# fname = \"flowcell384_lane7_pair1_TGACCA\"\n", "\n", "# Print available files in 0-Raws/ directory\n", "\n", @@ -184,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 192, "metadata": { "collapsed": false }, @@ -207,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 193, "metadata": { "collapsed": false }, @@ -215,10 +239,10 @@ { "data": { "text/plain": [ - "'flowcell384_lane7_pair1_TGACCA.fastq.gz'" + "'flowcell362_lane4_pair1_Undetermined.fastq.gz'" ] }, - "execution_count": 167, + "execution_count": 193, "metadata": {}, "output_type": "execute_result" } @@ -242,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 194, "metadata": { "collapsed": false }, @@ -252,12 +276,12 @@ "output_type": "stream", "text": [ "\n", - "flowcell384_lane7_pair1_TGACCA\n", + "flowcell362_lane4_pair1_Undetermined\n", "\n", - "@SN279:498:C88PKACXX:7:1109:1699:2220 1:N:0:TGACCA\n", - "NGAGGTGCACAATCGACCGATCCTGCTGTAGGCACCATCAATAGATCGGAA\n", + "@SN279:493:C84L3ACXX:4:2309:1848:2226 1:N:0:TTCCGT\n", + "NCTGTTACTGAGAAGTTAATGGATGAATTGGCACAATGCTACAATGTGCTC\n", "+\n", - "#1=D;DDDHHFHHIIIIIIIIIIIIIIIIHIIIIFHIIIIIHHIIIIIIII\n" + "#4=DDDDDHHHHHIHFHJJJJJJJJIJJJHIJJJJJJJJJJJJJJJIJJJJ\n" ] } ], @@ -279,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 195, "metadata": { "collapsed": false }, @@ -288,7 +312,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing will take approximatively 0 hour(s) 44 minute(s) and 38 second(s)\n" + "Processing will take approximatively 0 hour(s) 3 minute(s) and 31 second(s)\n" ] } ], @@ -334,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": null, "metadata": { "collapsed": false }, @@ -346,21 +370,11 @@ }, { "cell_type": "code", - "execution_count": 171, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This is cutadapt 1.9.1 with Python 3.5.1\n", - "Command line parameters: -a CTGTAGGCACCATCAATAGATCGGAA -o 1-Cutadapted/flowcell384_lane7_pair1_TGACCA.fastq.gz --quiet flowcell384_lane7_pair1_TGACCA.fastq.gz\n", - "Trimming 1 adapter with at most 10.0% errors in single-end mode ...\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "source ./source\n", @@ -378,19 +392,11 @@ }, { "cell_type": "code", - "execution_count": 172, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cutadapt run time : 0:38:47.798753\n" - ] - } - ], + "outputs": [], "source": [ "# Store current time\n", "after = datetime.datetime.now()\n", @@ -403,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": null, "metadata": { "collapsed": true }, @@ -424,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": null, "metadata": { "collapsed": false }, @@ -436,7 +442,7 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": null, "metadata": { "collapsed": false }, @@ -451,19 +457,11 @@ }, { "cell_type": "code", - "execution_count": 176, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Zcat run time : 0:00:40.103272\n" - ] - } - ], + "outputs": [], "source": [ "# Store current time\n", "after = datetime.datetime.now()\n", @@ -489,7 +487,7 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": null, "metadata": { "collapsed": false }, @@ -501,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": null, "metadata": { "collapsed": true }, @@ -526,19 +524,11 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Filtering run time : 0:35:14.961783\n" - ] - } - ], + "outputs": [], "source": [ "# Store current time\n", "after = datetime.datetime.now()\n", @@ -574,28 +564,12 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": null, "metadata": { "collapsed": false, "scrolled": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Time loading forward index: 00:00:00\n", - "Time loading mirror index: 00:00:00\n", - "End-to-end 2/3-mismatch full-index search: 00:04:29\n", - "# reads processed: 46620824\n", - "# reads with at least one reported alignment: 33736079 (72.36%)\n", - "# reads that failed to align: 12884745 (27.64%)\n", - "Reported 33736079 alignments to 1 output stream(s)\n", - "Time searching: 00:04:29\n", - "Overall time: 00:04:29\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "source ./source\n", @@ -644,7 +618,7 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": null, "metadata": { "collapsed": false }, @@ -657,24 +631,11 @@ }, { "cell_type": "code", - "execution_count": 182, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "@SQ\tSN:tW(UCA)Q\tLN:74\n", - "\n", - "@SQ\tSN:tY(GUA)Q\tLN:84\n", - "\n", - "@PG\tID:Bowtie\tVN:1.1.2\tCL:\"bowtie --wrapper basic-0 -S -v 3 -p 8 --time --best ref/2-Indexes/Yeast-Noncoding/Yeast-Noncoding 3-Filtered/flowcell384_lane7_pair1_TGACCA.fastq 4-Bowtied/flowcell384_lane7_pair1_TGACCA.sam\"\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "with open(\"3-Filtered/\" +fname+\".fastq\",\"r\") as filtered, \\\n", " open(\"4-Bowtied/\" +fname+\".sam\",\"r\") as matches, \\\n", @@ -702,19 +663,11 @@ }, { "cell_type": "code", - "execution_count": 183, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Filtering non-codant tRNA run time : 0:25:12.216325\n" - ] - } - ], + "outputs": [], "source": [ "# Store current time\n", "after = datetime.datetime.now()\n", @@ -736,27 +689,11 @@ }, { "cell_type": "code", - "execution_count": 184, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Time loading forward index: 00:00:00\n", - "Time loading mirror index: 00:00:00\n", - "End-to-end 2/3-mismatch full-index search: 00:01:06\n", - "# reads processed: 12884745\n", - "# reads with at least one reported alignment: 10864699 (84.32%)\n", - "# reads that failed to align: 2020046 (15.68%)\n", - "Reported 10864699 alignments to 1 output stream(s)\n", - "Time searching: 00:01:06\n", - "Overall time: 00:01:06\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "source ./source\n", @@ -781,7 +718,7 @@ }, { "cell_type": "code", - "execution_count": 185, + "execution_count": null, "metadata": { "collapsed": true }, @@ -812,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 186, + "execution_count": null, "metadata": { "collapsed": false }, @@ -856,7 +793,7 @@ }, { "cell_type": "code", - "execution_count": 187, + "execution_count": null, "metadata": { "collapsed": false },