import json
import re
from utils import read_json, read_jsonl, save_jsonl
import sys

import argparse

# Command-line interface. Both flags are mandatory.
# NOTE: arguments are parsed at import time, so importing this module
# without these flags on the command line makes argparse exit.
parser = argparse.ArgumentParser()
# Input file; handed to read_jsonl() in the __main__ section.
parser.add_argument("--json-path", type=str, required=True)
# Destination; handed to save_jsonl() in the __main__ section.
parser.add_argument("--output-path", type=str, required=True)
args = parser.parse_args()

def load_songci_dic(path=""):
    """Load the songci dictionary from a JSON file.

    Args:
        path: Location of the JSON file. The default is the empty string,
            meaning "no dictionary configured".

    Returns:
        Whatever ``read_json`` returns for *path*, or an empty dict when
        *path* is empty (an empty string cannot name a file, so nothing is
        read in that case).

    NOTE(review): the structure of the dictionary is not visible from this
    file - presumably a mapping describing 宋词 forms; confirm against the
    data file before relying on it.
    """
    if not path:
        return {}
    return read_json(path)

# Quotation marks and title marks that are deleted wherever they occur.
_PROCESS_QUOTES = str.maketrans("", "", '“”"《》')
# Punctuation on which a line is split into separate verses.
_PROCESS_PUNCT = re.compile("[，、。？！,：]")


def process(line_list):
    """Normalise the lines of a classical poem.

    For every line:

    * empty lines are dropped;
    * the characters ``“ ” " 《 》`` are removed wherever they occur;
    * the line is split on any of ``， 、 。 ？ ！ , ：`` and every non-empty
      fragment becomes its own entry.

    A line that ends up empty after this clean-up is dropped as well, the
    same way an empty input line is.

    Args:
        line_list: Iterable of strings, one per poem line.

    Returns:
        A ``(changed, poem)`` tuple. ``poem`` is the list of cleaned lines
        and ``changed`` is ``True`` when it differs from ``line_list`` in
        any way (a line removed, split, or edited).
    """
    # NOTE(review): whitespace is deliberately left intact. An earlier
    # revision called ``line.strip('')``, which strips nothing; confirm
    # whether trimming whitespace was actually intended.
    poem = []
    changed = False
    for line in line_list:
        if line == "":
            changed = True
            continue
        cleaned = line.translate(_PROCESS_QUOTES)
        # Without punctuation, split() returns [cleaned]; filtering the
        # empty strings also discards lines that held nothing but
        # quotes or punctuation.
        pieces = [piece for piece in _PROCESS_PUNCT.split(cleaned) if piece]
        # Flag a change only when the output really differs from the input,
        # so a literal '[' or ']' in the text is not mistaken for
        # punctuation.
        if pieces != [line]:
            changed = True
        poem.extend(pieces)
    return changed, poem

# Punctuation, quotation marks and title marks trimmed from both ends of a line.
_LAZY_STRIP_CHARS = '，、。？！,：“”"《》'


def lazy_process(line_list):
    """Trim surrounding punctuation from each line without splitting it.

    Any run of the characters ``， 、 。 ？ ！ , ： “ ” " 《 》`` at the start or
    end of a line is removed; punctuation inside the line is kept. The whole
    character set is stripped in one pass, so nested marks such as
    ``“你好。”`` are removed completely regardless of their order.

    Empty lines, and lines that consist only of the characters above, are
    dropped.

    Args:
        line_list: Iterable of strings, one per poem line.

    Returns:
        A ``(changed, poem)`` tuple. ``poem`` is the list of trimmed lines
        and ``changed`` is ``True`` when any line was trimmed or dropped.
    """
    poem = []
    changed = False
    for line in line_list:
        trimmed = line.strip(_LAZY_STRIP_CHARS)
        if trimmed != line or not trimmed:
            changed = True
        if trimmed:
            poem.append(trimmed)
    return changed, poem

def _parse_decomposition(raw):
    """Decode the JSON text stored in a record's ``gpt_decom`` field.

    The text may be wrapped in a Markdown code fence (three backticks,
    optionally followed by ``json``). Surrounding whitespace is removed
    first so that a trailing newline after the closing fence does not
    prevent the fence from being recognised.

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.
    """
    text = raw.strip()
    text = text.removeprefix('```json').removeprefix('```').removesuffix('```')
    return json.loads(text)


def _postprocess_record(d):
    """Clean one record in place.

    Args:
        d: A record read from the input file. It is expected to provide
            ``gpt_decom``, ``tags`` and either ``conversations`` or
            ``prompt``.

    Returns:
        ``d`` with the cleaned decomposition stored under ``decom_dic``, or
        ``None`` when the record must be dropped (empty poem, or a 宋词
        record without a ``词牌名`` tag).

    Raises:
        Exception: Any lookup or parsing error caused by a malformed record
            propagates to the caller.
    """
    # The prompt is not used afterwards, but a record without one is
    # malformed: the lookup raises and the caller counts it as an error.
    turns = d["conversations"] if "conversations" in d else d["prompt"]
    _prompt = turns[0]["value"]

    content = _parse_decomposition(d["gpt_decom"])
    if len(content["诗歌"]) == 0:
        return None

    genre_tags = d["tags"]["体裁"]
    genre = genre_tags["体裁"]

    if genre in ("绝句", "律诗"):
        title = content["标题"]
        lines = content["诗歌"]
        # A title as long as the first verse is suspicious.
        # NOTE(review): presumably the first verse was extracted as the
        # title - confirm. The case is only reported; the record is not
        # repaired.
        if len(title) == len(lines[0]):
            if len(lines) % 2 == 1:
                print("poetry wrong: ", content)
            elif title == lines[0]:
                print("poetry warn: ", content)

    if genre == "宋词" and "词牌名" not in genre_tags:
        return None

    # Classical forms are split on punctuation; everything else only has
    # the punctuation around each line trimmed.
    if genre in ("古体诗", "绝句", "律诗", "宋词"):
        cleaner = process
    else:
        cleaner = lazy_process
    changed, new_poem = cleaner(content["诗歌"])
    if changed:
        # Keep the untouched lines next to the cleaned ones.
        content["旧诗歌"] = content["诗歌"]
        content["诗歌"] = new_poem

    d["decom_dic"] = content
    return d


def main():
    """Read the input records, clean each one and write the survivors."""
    print("-----------------------")
    print("Poem post gpt4 process.")
    data = read_jsonl(args.json_path)
    # NOTE(review): the dictionary is loaded, but no active check reads it.
    songci_dic = load_songci_dic()

    new_data = []
    error_count = 0
    for d in data:
        try:
            record = _postprocess_record(d)
        except Exception:
            # Best effort: a malformed record is counted and skipped.
            # KeyboardInterrupt and SystemExit are deliberately not caught,
            # so the run can still be interrupted.
            error_count += 1
            continue
        if record is not None:
            new_data.append(record)
    print("error count", error_count)
    print("left data", len(new_data))

    save_jsonl(new_data, args.output_path)


if __name__ == '__main__':
    main()
    
