import argparse
import pathlib
from typing import List

import numpy as np
import pandas as pd

from aligners import MMseqs


def _bounded_float(low, high):
    """Build an argparse ``type`` callable accepting floats in ``[low, high]``.

    A range check is used instead of ``choices=np.arange(...)``: ``choices``
    compares with ``==``, and grid points computed in floating point (for
    example ``3 * 0.1``) are not equal to the value parsed from the text
    ``"0.3"``, so ordinary inputs were being rejected.

    Args:
        low: Smallest accepted value (inclusive).
        high: Largest accepted value (inclusive).

    Returns:
        A function converting a command-line string to ``float``; it raises
        ``argparse.ArgumentTypeError`` for non-numeric or out-of-range input.
    """
    def parse(text):
        try:
            value = float(text)
        except ValueError:
            raise argparse.ArgumentTypeError(
                'invalid float value: {!r}'.format(text)) from None
        # Written as a negated chained comparison so that NaN is rejected too.
        if not low <= value <= high:
            raise argparse.ArgumentTypeError(
                '{} is outside the allowed range [{}, {}]'.format(value, low, high))
        return value
    return parse


# Command-line interface of the MMseqs driver script.
parser = argparse.ArgumentParser()
# NOTE(review): '-i' and '-o' are declared required, so argparse never uses
# their `default` values. Drop `required=True` if the defaults are meant to
# apply, or drop the defaults if the paths must always be given explicitly.
parser.add_argument('-i', '--inputpath',
                    help='Path to input fasta',
                    metavar='I',
                    type=pathlib.Path,
                    required=True,
                    default="data/deepfri_ec/sequences/deepfri_ec_seqs.fasta")
parser.add_argument('-o', '--outputpath',
                    help='Path to output folder.',
                    metavar='O',
                    type=pathlib.Path,
                    required=True,
                    default="data/deepfri_ec/mmseqs")
# Upper bound 9.9 is the largest value the previous 0.1-step grid contained.
parser.add_argument('-s',
                    type=_bounded_float(0.0, 9.9),
                    required=False,
                    default=7.5,
                    help="Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive.")
parser.add_argument('-c',
                    type=_bounded_float(0.0, 1.0),
                    required=False,
                    default=0.8,
                    help="List matches above this fraction of aligned (covered) residues.")
parser.add_argument('--cov_mode',
                    type=int,
                    required=False,
                    default=1,
                    choices=[0, 1, 2, 3, 4, 5],
                    help="1: coverage of target.")
parser.add_argument('--alignment_mode',
                    type=int,
                    required=False,
                    default=3,
                    choices=[0, 1, 2, 3],
                    help="0:automatic; 3: Outputs alignment start, end, and seq_id score.")
parser.add_argument('--alignment_output_mode',
                    type=int,
                    required=False,
                    default=0,
                    help="Has to be set to 0 = automatic. MMseqs does not work with any other value.")
# `nargs='+'` collects one or more space-separated column names into a list of
# str, the same shape as the default. The former `type=List[str]` could not be
# called by argparse, so the option failed whenever it was supplied.
parser.add_argument('--alignment_outputs',
                    nargs='+',
                    type=str,
                    required=False,
                    default=['query', 'target', 'evalue', 'fident', 'nident', 'qstart', 'qend', 'qlen',
                             'tstart', 'tend', 'tlen', 'alnlen',
                             'cigar', 'qseq', 'qaln', 'taln', 'tseq', 'qcov', 'tcov'],
                    help="Metrics outputted by the alignment module.")
parser.add_argument('-e',
                    type=float,
                    required=False,
                    default=1.000E-03,
                    help="List matches below this E-value. The lower, the more stringent the comparisons.")
# No default: the attribute is None unless the flag is given.
parser.add_argument('--min_seq_id',
                    type=_bounded_float(0.0, 1.0),
                    required=False,
                    help="List matches above this sequence identity (range 0.0-1.0).")
parser.add_argument('--seq_id_mode',
                    type=int,
                    required=False,
                    default=0,
                    help="Which sequence the seq_id score is normalized with - 0: alignment, 1: shorter seq; 2: longer seq.")
parser.add_argument('--cluster_mode',
                    type=int,
                    required=False,
                    default=1,
                    help="0: greedy, 1: connected components, 2-3: CDHIT.")
# Stage switches: each pipeline stage runs only when its flag is present.
parser.add_argument('--create_db',
                    action='store_true',
                    help="Whether to create a sequence database.")
parser.add_argument('--prefilter',
                    action='store_true',
                    help="Whether to do the prefilter k-mer match stage.")
parser.add_argument('--align',
                    action='store_true',
                    help="Whether to align the prefiltered sequences.")
parser.add_argument('--cluster',
                    action='store_true',
                    help="Whether to cluster the sequence database.")
parser.add_argument('--threads',
                    type=int,
                    required=False,
                    default=64,
                    help="Number of CPU-cores used.")
parser.add_argument('-v',
                    type=int,
                    required=False,
                    default=3,
                    choices=[0, 1, 2, 3],
                    help="Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info.")

def main(args):
    """Run the MMseqs pipeline stages selected on the command line.

    An ``MMseqs`` object is built from ``args`` and its output directories are
    created; afterwards each of the stages ``create_db``, ``prefilter``,
    ``align`` and ``cluster`` is invoked, in that order, only if the
    corresponding boolean attribute of ``args`` is set.

    Args:
        args: Parsed command-line namespace. This function itself reads
            ``alignment_output_mode``, ``create_db``, ``prefilter``, ``align``
            and ``cluster``; the whole namespace is handed to ``MMseqs``.

    Raises:
        ValueError: If ``args.alignment_output_mode`` is not 0.
    """
    # Check input parameters. An explicit raise is used rather than `assert`
    # so the check is still enforced when Python runs with -O.
    if args.alignment_output_mode != 0:
        raise ValueError('Step mmseqs align does not work with alignment_output_mode set to any other value than 0.')
    mmseqs = MMseqs(args)
    mmseqs.make_output_dirs()

    if args.create_db:
        mmseqs.create_db()

    if args.prefilter:
        # Calculate prefiltering scores for k-mer matches
        # options to vary: --max-seqs, -c, --cov-mode
        mmseqs.prefilter()

    if args.align:
        # Calculate alignment for sequences passing prefiltering > sequence identity, alignment coverage, e-value
        # options to vary: --alignment-mode, -e, --min-seq-id, --seq-id-mode, -c, --cov-mode
        mmseqs.align()

    if args.cluster:
        # Cluster sequences using alignment data
        # options to vary: --cluster-mode
        mmseqs.cluster()


if __name__ == "__main__":
    # Script entry point: parse the command line, echo the resolved settings
    # for the run log, then hand them to the pipeline driver.
    cli_args = parser.parse_args()
    print(cli_args)
    main(cli_args)










