A. Treatment of Sequences
No. 1
#Extract FASTA sequences from one file using the sequence IDs listed in another file
#Write two separate scripts with file names FastaToTbl and TblToFasta with the following content:
FastaToTbl:
#! /usr/bin/awk -f
# FastaToTbl: flatten FASTA input into one line per record, written as
# "<header without '>'>" TAB "<sequence joined onto a single line>".
{
    if (substr($1, 1, 1) == ">") {
        # Header line: drop the leading ">". Every header except the one on
        # the very first input line also ends the previous record's line.
        if (NR > 1)
            printf "\n%s\t", substr($0, 2, length($0) - 1)
        else
            printf "%s\t", substr($0, 2, length($0) - 1)
    } else {
        # Sequence line: append to the current record without a newline.
        printf "%s", $0
    }
}
END {
    # Terminate the last record.
    printf "\n"
}
TblToFasta:
#! /usr/bin/awk -f
# TblToFasta: turn each whitespace-separated input line back into a FASTA
# record. The last field is the sequence; all preceding fields form the
# header line.
{
    sequence = $NF
    ls = length(sequence)

    # Header: print ">" followed by fields 1..NF-1, each followed by a space,
    # then end the header line after the last header field.
    fld = 1
    while (fld < NF) {
        if (fld == 1) { printf ">" }
        printf "%s ", $fld
        if (fld == NF-1) {
            printf "\n"
        }
        fld = fld + 1
    }

    # Sequence: wrap at 60 characters per line.
    is = 1
    while (is <= ls) {
        printf "%s\n", substr(sequence, is, 60)
        is = is + 60
    }
}
#For further explanation, see the reference below.
No. 2.
#Convert Multi FASTA line to a single FASTA line
# Header lines are printed as-is; sequence lines are appended without a newline.
# printf "\n" (not print "\n", which would add a second newline) ends each sequence line.
$ awk '{if (substr($0,1,1)==">"){if (p){printf "\n";} print $0} else printf("%s",$0);p++;}END{printf "\n"}' input_file.fasta > singleline.fasta
OR use the following awk program:
% awk '
# Header line: first close any pending sequence line, then print the header.
/^>/ { printf "%s%s\n", pending, $0; pending = ""; next }
# Sequence line: append without a newline and remember that one is owed.
{ printf "%s", $0; pending = "\n" }
# Close the final sequence line, if there was one.
END { printf "%s", pending }
' input_file.fasta > singleline.fasta
Ref: https://stackoverflow.com/questions/15857088/remove-line-breaks-in-a-fasta-file
No. 3
# Randomly select a set of FASTA sequences using a Python script:
from random import sample

from Bio import SeqIO

# Input FASTA file to sample from.
INPUT_PATH = r"/Users/User pc/Desktop/pythonEx/viralNinput.txt"
# NOTE(review): the original snippet wrote to a handle `v` that was never
# opened; this output path is a placeholder -- change it to suit.
OUTPUT_PATH = r"/Users/User pc/Desktop/pythonEx/viralNsample.txt"
# Number of sequences to draw. random.sample raises ValueError if the input
# holds fewer records than this.
SAMPLE_SIZE = 10000

# Read every record while the input file is still open; SeqIO.parse is lazy.
with open(INPUT_PATH) as f:
    records = list(SeqIO.parse(f, "fasta"))

chosen = sample(records, SAMPLE_SIZE)

with open(OUTPUT_PATH, "w") as v:
    for seq in chosen:
        # One record: ">name" line followed by the sequence on a single line.
        v.write(">{}\n{}\n".format(seq.name, seq.seq))
        # NOTE(review): the original's indentation was lost; this blank line
        # after each record and the trailing blank lines below are kept on the
        # assumption that this was the intended placement.
        v.write("\n")
    v.write("\n\n")
Comments
Post a Comment