#DNA FASTAファイルから最長のORFを抽出し5UTR,CDS,3UTRに分割して保存する。 import sys, os, re from Bio import SeqIO from Bio.Alphabet import IUPAC from Bio.Seq import Seq fasta_file = sys.argv[1] faname = os.path.basename(fasta_file) fdir = os.path.dirname(fasta_file) fname = os.path.splitext(faname)[0] fext = os.path.splitext(fasta_file)[1] utr_5_fasta = os.path.splitext(fasta_file)[0] + '_5utr' + fext cds_fasta = os.path.splitext(fasta_file)[0] + '_cds' + fext utr_3_fasta = os.path.splitext(fasta_file)[0] + '_3utr' + fext for record in SeqIO.parse(fasta_file, 'fasta'): cds = re.findall('(ATG(?:\S{3})*?T(?:AG|AA|GA))', str(record.seq).upper()) if cds: match = max(cds, key = len, , default=0) seq = Seq(match, IUPAC.ambiguous_dna) utr_5 = re.sub(str(seq.strip()+'[ACGT]*'), '', str(record.seq).upper()) utr_3 = re.sub(str(utr_5 + seq.strip()), '', str(record.seq).upper()) with open(utr_5_fasta, 'a') as f: f.write(">" + record.id +'\n') f.write(str(utr_5) + '\n') with open(cds_fasta, 'a') as f: f.write(">" + record.id +'\n') f.write(str(seq) + '\n') with open(utr_3_fasta, 'a') as f: f.write(">" + record.id +'\n') f.write(str(utr_3) + '\n') else: t_cds = re.findall('(ATG[ACGT]*)', str(record.seq).upper()) if t_cds: match = max(t_cds, key = len) seq = Seq(match, IUPAC.ambiguous_dna) utr_5 = re.sub(str(seq.strip()+'[ACGT]*'), '', str(record.seq).upper()) utr_3 = re.sub(str(utr_5 + seq.strip()), '', str(record.seq).upper()) with open(utr_5_fasta, 'a') as f: f.write(">" + record.id +'\n') f.write(str(utr_5) + '\n') with open(cds_fasta, 'a') as f: f.write(">" + record.id +'\n') f.write(str(seq) + '\n') with open(utr_3_fasta, 'a') as f: f.write(">" + record.id +'\n') f.write(str(utr_3) + '\n')
CDSがstopまで入っていないとき5UTRも切り出せていなかったところを改善してみた。