cDNA FASTAファイルから最長ORFを抽出し、5'UTR/CDS/3'UTRに分割してそれぞれのFASTAファイルを作成する

cDNA FASTAファイルの各配列から最長のORFを抽出し、5'UTR、CDS、3'UTRに分割して保存する。
Multi FASTA にも対応する。

#fasta_utr.py

import sys, os, re
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq

# An ORF anchored at the current position: ATG, then whole codons (matched
# non-greedily) up to the first in-frame stop codon (TAG / TAA / TGA).
# The pattern sits inside a zero-width lookahead so that the scan advances one
# base at a time and ORFs overlapping an earlier match (e.g. in another reading
# frame) are still considered as candidates.
ORF_PATTERN = re.compile(r'(?=(ATG(?:\S{3})*?T(?:AG|AA|GA)))')


def split_longest_orf(sequence):
    """Split a cDNA sequence around its longest ORF.

    Args:
        sequence: Nucleotide sequence as a plain string. Matching is
            case-sensitive (upper-case ATG / TAG / TAA / TGA).

    Returns:
        A ``(utr_5, cds, utr_3)`` tuple of strings whose concatenation equals
        ``sequence``, or ``None`` when the sequence contains no ORF. When
        several ORFs share the maximal length, the 5'-most one is used.
    """
    best = None
    for found in ORF_PATTERN.finditer(sequence):
        start, end = found.span(1)
        # Strict comparison keeps the first (5'-most) ORF on ties.
        if best is None or end - start > best[1] - best[0]:
            best = (start, end)
    if best is None:
        return None
    start, end = best
    # Slice by coordinates instead of re-matching the sequence text, so
    # repeated CDS text and ambiguity codes such as N cannot corrupt the UTRs.
    return sequence[:start], sequence[start:end], sequence[end:]


def main(fasta_file):
    """Write the 5'UTR, CDS and 3'UTR of every record to sibling FASTA files.

    The output files are named after ``fasta_file`` with ``_5utr``, ``_cds``
    and ``_3utr`` inserted before the extension. Records without an ORF are
    reported on stderr and skipped.

    Args:
        fasta_file: Path to a (multi-)FASTA file of cDNA sequences.
    """
    root, ext = os.path.splitext(fasta_file)
    utr_5_fasta = root + '_5utr' + ext
    cds_fasta = root + '_cds' + ext
    utr_3_fasta = root + '_3utr' + ext
    # Append mode is kept from the original script: re-running on the same
    # input adds to existing output files rather than replacing them.
    with open(utr_5_fasta, 'a') as out_5, \
            open(cds_fasta, 'a') as out_cds, \
            open(utr_3_fasta, 'a') as out_3:
        for record in SeqIO.parse(fasta_file, 'fasta'):
            parts = split_longest_orf(str(record.seq))
            if parts is None:
                sys.stderr.write('no ORF found in ' + record.id + '; skipped\n')
                continue
            for handle, part in zip((out_5, out_cds, out_3), parts):
                handle.write('>' + record.id + '\n')
                handle.write(part + '\n')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: python fasta_utr.py <cdna.fasta>')
    main(sys.argv[1])

使い方は簡単で、処理したいFASTAファイルを引数に指定してスクリプトを実行する(例: `python fasta_utr.py input.fasta`)。元のファイルと同じディレクトリに、ファイル名(拡張子の前)に _5utr, _cds, _3utr を付加した3つのファイルが生成される。