import argparse
import sys

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm

if __name__ == '__main__':
    # Script entry point: read a tab-separated table of proteins and write one
    # FASTA record per row. Each record uses the 'Entry' column as its id, the
    # 'EC number' column as its description and the 'Sequence' column as its
    # sequence.
    parser = argparse.ArgumentParser(description='CSV To Fasta')
    parser.add_argument('--csv_file', default="datasets/split100.csv", type=str, help='csv file contains protein sequences')
    parser.add_argument('--fasta_file', default="datasets/split100.fasta", type=str, help='fasta file to store proteins')

    args = parser.parse_args()

    # Despite the "csv" naming, the input is parsed as tab-separated.
    data = pd.read_csv(args.csv_file, sep='\t')

    # Check the schema before touching the output file, so a wrong input does
    # not leave a truncated FASTA file behind and the user gets a readable
    # message instead of a bare KeyError.
    required_columns = ('Entry', 'EC number', 'Sequence')
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        parser.error(
            f"{args.csv_file} is missing required column(s): "
            f"{', '.join(missing_columns)}"
        )

    # A record needs both an id and a sequence; pandas reads empty cells as
    # NaN, which is not usable for either, so such rows are skipped (and
    # reported) rather than aborting the whole conversion.
    complete = data.dropna(subset=['Entry', 'Sequence'])
    skipped = len(data) - len(complete)
    if skipped:
        print(
            f"Skipping {skipped} row(s) without an Entry or Sequence value",
            file=sys.stderr,
        )

    rows = zip(complete['Entry'], complete['EC number'], complete['Sequence'])
    records = (
        SeqRecord(
            # SeqRecord is documented to take a Seq object, not a plain str.
            seq=Seq(str(sequence)),
            # SeqRecord only accepts string ids/descriptions; a missing
            # EC number (NaN) becomes an empty description.
            id=str(entry),
            description='' if pd.isna(ec_number) else str(ec_number),
        )
        for entry, ec_number, sequence in tqdm(rows, total=len(complete))
    )

    # SeqIO.write accepts an iterable of records, so the lazily built records
    # are streamed to the file in a single call.
    with open(args.fasta_file, 'w') as fout:
        SeqIO.write(records, fout, 'fasta')