## generate last database lastdb -P8 -uNEAR -R01 barcode_IFNAcons_Actb.fa barcode_IFNAcons_Actb.fa ## train last for optimal matching last-train -Q 1 -P8 barcode_IFNAcons_Actb.fa basecalled_pass_all.fastq ## map sequences to index lastal -P 10 -K 1 -C 1 -Q 1 -M -p barcode_ifna.par barcode_IFNAcons_Actb.fa basecalled_pass_all.fastq | gzip > last_pass_allBC_vs_BIA.maf.gz lastal -P 10 -K 1 -C 1 -Q 1 -M -p barcode_ifna.par barcode_IFNAcons_Actb.fa basecalled_fail_all.fastq | gzip > last_fail_allBC_vs_BIA.maf.gz ## quantify matches zcat last_pass_allBC_vs_BIA.maf.gz | ~/scripts/maf_bcsplit.pl -seqfile basecalled_pass_all.fasta | gzip > locations_pass_allBC.tsv.gz zcat last_fail_allBC_vs_BIA.maf.gz | ~/scripts/maf_bcsplit.pl -seqfile basecalled_fail_all.fasta | gzip > locations_fail_allBC.tsv.gz ## aggregate location matches zcat locations_fail_allBC.tsv.gz | sort -k 1,1r -k 4,4n -k 5,5n | ~/scripts/posAggregate.pl | gzip > aggregated_locations_fail_allBC.tsv.gz zcat locations_pass_allBC.tsv.gz | sort -k 1,1r -k 4,4n -k 5,5n | ~/scripts/posAggregate.pl | gzip > aggregated_locations_pass_allBC.tsv.gz ## generate sequence overviews zcat aggregated_locations_pass_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | perl -pe 's/\[RC\]//g' | sort | uniq -c | sort -rn > aggregated_overview_counts_pass_allBC.txt zcat aggregated_locations_fail_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | perl -pe 's/\[RC\]//g' | sort | uniq -c | sort -rn > aggregated_overview_counts_fail_allBC.txt ## including reverse complement zcat aggregated_locations_pass_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | sort | uniq -c | sort -rn | gzip > aggregated_overviewRC_counts_pass_allBC.txt.gz zcat aggregated_locations_fail_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | sort | uniq -c | sort -rn | gzip > aggregated_overviewRC_counts_fail_allBC.txt.gz ## excluding ONT adapter sequences zgrep -v -e 'adapt' -e 'Hairpin' aggregated_locations_pass_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | sort | uniq -c | sort -rn | gzip > aggregated_nonONT_overviewRC_counts_pass_allBC.txt.gz zgrep -v -e 'adapt' -e 'Hairpin' aggregated_locations_fail_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | sort | uniq -c | sort -rn | gzip > aggregated_nonONT_overviewRC_counts_fail_allBC.txt.gz ## excluding ONT adapter sequences and ignoring RC zgrep -v -e 'adapt' -e 'Hairpin' aggregated_locations_pass_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | perl -pe 's/\[RC\]//g' | sort | uniq -c | sort -rn | gzip > aggregated_nonONT_overview_counts_pass_allBC.txt.gz zgrep -v -e 'adapt' -e 'Hairpin' aggregated_locations_fail_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | perl -pe 's/\[RC\]//g' | sort | uniq -c | sort -rn | gzip > aggregated_nonONT_overview_counts_fail_allBC.txt.gz ## find specific chimeric read zcat aggregated_locations_fail_allBC.tsv.gz | tail -n +2 | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n".@F[0].",\""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | perl -pe 's/\[RC\]//g' | grep 'Actb.*IFNA_consensus.*NB09' ## get total mapped read count (zcat aggregated_locations_fail_allBC.tsv.gz | tail -n +2; zcat aggregated_locations_pass_allBC.tsv.gz | tail -n +2) | awk '{print $1}' | uniq | sort | uniq | wc -l ## [output: 264899] ## get total mapped IFNA read count (zcat aggregated_locations_fail_allBC.tsv.gz | tail -n +2; zcat aggregated_locations_pass_allBC.tsv.gz | tail -n +2) | grep '\(IFNA\|NB04\|NB09\)' | \ awk '{print $1}' | uniq | sort | uniq | wc -l ## [output: 113524] ## collect names for IFNA reads, excluding definitely chimeric reads [108241 reads] (zcat aggregated_locations_fail_allBC.tsv.gz | tail -n +2; zcat aggregated_locations_pass_allBC.tsv.gz | tail -n +2;) | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n$F[0] \""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | \ tail -n +2 | grep -v '\(Actb\|IFN[^A]\|NB05\|NB06\|NB10\|NB11\)' | grep '\(IFNA\|NB04\|NB09\)' | \ grep -v '\(NB04.*NB09\|NB09.*NB04\)' | gzip > readNames_IFNA_consensus.txt.gz ## create per-read aggregated alignment string (zcat aggregated_locations_fail_allBC.tsv.gz | tail -n +2; zcat aggregated_locations_pass_allBC.tsv.gz | tail -n +2;) | \ perl -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n$F[0] \""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | tail -n +2 | gzip > aggregated_alignments_all.txt.gz ## work out NB04/NB09 mappings (zgrep 'NB04' aggregated_alignments_all.txt.gz | grep -v 'NB[01][^4]' | perl -pe 's/\".*$/NB04/'; zgrep 'NB09' aggregated_alignments_all.txt.gz | grep -v 'NB[01][^9]' | perl -pe 's/\".*$/NB09/') | gzip > read_quick_ON_mappings.txt.gz ## map reads to IFNA lastal -P 10 -l 20 -Q 1 -j 7 -p ifna_only.par IFN_All.fa <(pv basecalled_IFNA_consensus.fastq) | gzip > LAST_IFNAmapped_vs_IFNAonly.maf.gz ## convert to more easily-parsed CSV file (echo "read,start,mlen,strand,rlen,mname,evalue"; zless LAST_IFNAmapped_vs_IFNAonly.maf.gz | \ perl -ane 'if($F[0] eq "a"){$F[3] =~ s/^E=//; print "\n$F[3]"} if($F[0] eq "s"){print(" $F[1] ", join(" ",@F[2..5]))}' | \ perl -lane 'print(join(",",@F[(6,7,8,9,10,1,,0)]))' | tail -n +2) | gzip > LAST_IFNAmapped_vs_IFNAonly.csv.gz ## process with R (see IFNA_isoforms.r) ### Albacore-called reads ## map sequences to index lastal -P 10 -K 1 -C 1 -Q 1 -M -p barcode_ifna.par barcode_IFNAcons_Actb.fa <(pv albacored_all.fastq) | gzip > albacored_all_vs_BIA.maf.gz ## quantify matches pv albacored_all_vs_BIA.maf.gz | zcat | ~/scripts/maf_bcsplit.pl | sort -t',' -k 1,1r -k 4,4n -k 5,5n | gzip > locations_albacored_all_vs_BIA.csv.gz ## aggregate location matches pv locations_albacored_all_vs_BIA.csv.gz | zcat | ~/scripts/posAggregate.pl | gzip > aggregated_locations_albacored_all_vs_BIA.csv.gz ## create per-read aggregated alignment string pv aggregated_locations_albacored_all_vs_BIA.csv.gz | zcat | tail -n +2 | \ perl -F',' -ane '$F[2] =~ s/-/[RC]/; $F[2] =~ s/\+//; if($F[0] ne $old){print "\"\n$F[0] \""} else {print " - "} print(@F[1..2]); $old=$F[0]; END{print "\"\n"}' | tail -n +2 | gzip > aggregated_alignments_albacored_all.txt.gz ## sort based on alignment string length pv aggregated_alignments_albacored_all.txt.gz | zcat | perl -lane '{$line=$_; s/^.*? //; print length($_)." ".$line}' | \ sort -rn -k 1,1 | perl -pe 's/^[0-9]+ //' | gzip > lengthSorted_aggregated_alignments_albacored_all.txt.gz ## isolate IFNA-mapped reads pv albacored_all.fastq | ~/scripts/fastx-fetch.pl -i reads_IFNA_albacored.txt.gz > albacored_IFNA_consensus.fastq ## map reads to IFNA paralogs lastal -P 10 -l 20 -Q 1 -j 7 -p ifna_only.par IFN_All.fa <(pv albacored_IFNA_consensus.fastq) | gzip > LAST_albacored_IFNAmapped_vs_IFNAonly.maf.gz ## convert to more easily-parsed CSV file (echo "read,start,mlen,strand,rlen,mname,evalue"; zless LAST_albacored_IFNAmapped_vs_IFNAonly.maf.gz | \ perl -ane 'if($F[0] eq "a"){$F[3] =~ s/^E=//; print "\n$F[3]"} if($F[0] eq "s"){print(" $F[1] ", join(" ",@F[2..5]))}' | \ perl -lane 'print(join(",",@F[(6,7,8,9,10,1,,0)]))' | tail -n +2) | gzip > LAST_albacored_IFNAmapped_vs_IFNAonly.csv.gz ## work out NB04/NB09 mappings (zgrep 'NB04' aggregated_alignments_albacored_all.txt.gz | grep -v 'NB[01][^4]' | perl -pe 's/\".*$/NB04/'; zgrep 'NB09' aggregated_alignments_albacored_all.txt.gz | grep -v 'NB[01][^9]' | perl -pe 's/\".*$/NB09/') | gzip > read_albacored_quick_ON_mappings.txt.gz ## process with R (see IFNA_isoforms.r)