cDNA FASTAファイルから最長ORFを抽出し、5'UTR/CDS/3'UTRに分割してそれぞれのFASTAファイルを作成する(改訂版）

#DNA FASTAファイルから最長のORFを抽出し５UTR,CDS,３UTRに分割して保存する。

import sys, os, re
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq

fasta_file = sys.argv[1]
faname = os.path.basename(fasta_file)
fdir = os.path.dirname(fasta_file)
fname = os.path.splitext(faname)[0]
fext = os.path.splitext(fasta_file)[1]
utr_5_fasta = os.path.splitext(fasta_file)[0] + '_5utr' + fext
cds_fasta = os.path.splitext(fasta_file)[0] + '_cds' + fext
utr_3_fasta = os.path.splitext(fasta_file)[0] + '_3utr' + fext
for record in SeqIO.parse(fasta_file, 'fasta'):
    cds = re.findall('(ATG(?:\S{3})*?T(?:AG|AA|GA))', str(record.seq).upper())

    if cds:
        match = max(cds, key = len, , default=0)
        seq = Seq(match, IUPAC.ambiguous_dna)
        utr_5 = re.sub(str(seq.strip()+'[ACGT]*'), '', str(record.seq).upper())
        utr_3 = re.sub(str(utr_5 + seq.strip()), '', str(record.seq).upper())
        with open(utr_5_fasta, 'a') as f:
            f.write(">" + record.id +'\n')
            f.write(str(utr_5) + '\n')
        with open(cds_fasta, 'a') as f:
            f.write(">" + record.id +'\n')
            f.write(str(seq) + '\n')
        with open(utr_3_fasta, 'a') as f:
            f.write(">" + record.id +'\n')
            f.write(str(utr_3) + '\n')
    else:
        t_cds = re.findall('(ATG[ACGT]*)', str(record.seq).upper())
        if t_cds:
            match = max(t_cds, key = len)
            seq = Seq(match, IUPAC.ambiguous_dna)
            utr_5 = re.sub(str(seq.strip()+'[ACGT]*'), '', str(record.seq).upper())
            utr_3 = re.sub(str(utr_5 + seq.strip()), '', str(record.seq).upper())
            with open(utr_5_fasta, 'a') as f:
                f.write(">" + record.id +'\n')
                f.write(str(utr_5) + '\n')
            with open(cds_fasta, 'a') as f:
                f.write(">" + record.id +'\n')
                f.write(str(seq) + '\n')
            with open(utr_3_fasta, 'a') as f:
                f.write(">" + record.id +'\n')
                f.write(str(utr_3) + '\n')

CDSがstopまで入っていないとき５UTRも切り出せていなかったところを改善してみた。

kuroの覚え書き

96の個人的覚え書き

cDNA FASTAファイルから最長ORFを抽出し、5'UTR/CDS/3'UTRに分割してそれぞれのFASTAファイルを作成する(改訂版）