import os
import json
from tqdm import tqdm
from retriv import SearchEngine
import argparse
import shutil
from pathlib import Path


def dump_jsonl(corpus_path, max_num=None):
    """Convert a tab-separated passage corpus into a JSON Lines file.

    The input is read as UTF-8. Its first line is treated as a header and
    skipped; every following non-blank line must hold exactly three
    tab-separated fields in the order ``id``, ``passage``, ``title``. Each
    row is written as one JSON object with the keys ``id`` (int), ``title``
    and ``text`` to ``psgs_w100.jsonl`` in the directory of ``corpus_path``.

    Rows are converted and written one at a time, so the corpus is never
    held in memory as a whole. If a malformed row is encountered, the output
    file will contain only the rows written before it.

    Args:
        corpus_path: Path to the TSV corpus file.
        max_num: Optional cap on the number of data lines (header excluded)
            to convert, with list-slice semantics; ``None`` converts all.

    Returns:
        The path of the JSONL file that was written.

    Raises:
        ValueError: If a row does not have exactly three fields, if its id
            is not an integer, or if ``corpus_path`` is the output file.
    """
    from itertools import islice

    output_path = os.path.join(os.path.dirname(corpus_path), 'psgs_w100.jsonl')
    # Input and output are open at the same time; opening the output for
    # writing would truncate the corpus if both were the same file.
    if os.path.abspath(corpus_path) == os.path.abspath(output_path):
        raise ValueError(
            f"corpus_path must not be the output file itself: {output_path}"
        )

    with open(corpus_path, 'r', encoding='utf-8') as fin, \
            open(output_path, 'w', encoding='utf-8') as fout:
        next(fin, None)  # skip the header line; tolerate an empty file

        if max_num is None or max_num >= 0:
            rows = islice(fin, max_num)
        else:
            # islice rejects negative stops; fall back to a real slice so a
            # negative max_num keeps its "all but the last k lines" meaning.
            rows = fin.readlines()[:max_num]

        for line_no, line in enumerate(tqdm(rows), start=2):
            if not line.strip():
                continue  # ignore blank lines (e.g. a trailing newline)
            # Strip only the line terminator: stripping all whitespace would
            # also eat the tab that precedes an empty title and drop a field.
            items = line.rstrip("\r\n").split("\t")
            if len(items) != 3:
                raise ValueError(
                    f"{corpus_path}:{line_no}: expected 3 tab-separated "
                    f"fields, got {len(items)}"
                )
            idx, passage, title = items
            record = {
                "id": int(idx),
                "title": title.rstrip(),
                "text": passage
            }
            fout.write(json.dumps(record) + "\n")
    return output_path


def str2bool(v):
    """Interpret a command-line token as a boolean.

    Values that are already ``bool`` are returned unchanged. Otherwise the
    value is lower-cased and matched against the accepted spellings of
    true (yes/true/t/y/1) and false (no/false/f/n/0).

    Raises:
        argparse.ArgumentTypeError: If the value matches neither set.
    """
    if isinstance(v, bool):
        return v

    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')

    token = v.lower()
    if token in truthy:
        return True
    if token in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


def main():
    """Build a retriv search index from a TSV passage corpus.

    Command-line options:
        --num_chunk: Integer used only to form the index name.
        --merge_title_to_content: When true, the title is prepended to the
            passage text before indexing; otherwise title and text are kept
            as separate fields.
        --corpus_path: TSV corpus to convert and index (required).
        --target_path: If this path already exists, indexing is skipped
            (required).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--num_chunk', type=int, default=1)
    ap.add_argument('--merge_title_to_content', type=str2bool, default=False)
    # Both paths are passed straight to os.path functions, which fail with a
    # TypeError on None, so require them up front with a clear usage error.
    ap.add_argument('--corpus_path', type=str, required=True)
    ap.add_argument('--target_path', type=str, required=True)
    args = ap.parse_args()

    n = args.num_chunk
    if args.merge_title_to_content:
        index_name = f"wiki_tit_body_w{n}00"
    else:
        index_name = f"wiki_body_w{n}00"

    if os.path.exists(args.target_path):
        print(f'search engine named {index_name} already exists!')
    else:
        # Convert the corpus only when an index is actually going to be
        # built; the JSONL file is just the input for index_file().
        jsonl_path = dump_jsonl(args.corpus_path)

        if args.merge_title_to_content:
            def to_document(doc):
                return {
                    "id": doc["id"],
                    "text": doc["title"] + "\n" + doc["text"]
                }
        else:
            def to_document(doc):
                return {
                    "id": doc["id"],
                    "text": doc["text"],
                    "title": doc["title"]
                }

        # Remove any previous index of the same name before rebuilding.
        SearchEngine.delete(index_name)
        se = SearchEngine(index_name)
        se.index_file(path=jsonl_path, show_progress=True,
                      callback=to_document)

        print(f"search engine of index_name: {index_name} is prepared..")

        # NOTE(review): the index is presumably stored by retriv under
        # ~/.retriv/collections/<index_name> and is never copied to
        # --target_path, so the existence check above only succeeds if that
        # path is created by other means. Confirm whether a copy step is
        # intended here.

    print(f'specify --search_engine={index_name} in running search_server.py')


if __name__ == "__main__":
    main()