import random
import json
import pandas as pd
import ast
import re
import os
# Inclusive index window [start, end] of issue records that generate_data()
# turns into dataset rows; also used to name the output parquet file.
start = 0
end = 10000
conversations = []  # NOTE(review): not read anywhere in the visible code.
# Repos that have at least one record with a positive score, in first-seen
# order and without duplicates. generate_data() keeps only these repos.
repo_ids = []
# Highest score seen so far for every repo listed in repo_ids.
id2score = {}
with open('./smith_output.jsonl', "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        record = json.loads(line)
        repo_id = record.get("repo_id", "")
        score = record.get("score", 0)
        # A missing or non-numeric score cannot be ordered against 0; the
        # previous '' default raised TypeError on the comparison below.
        if not isinstance(score, (int, float)):
            continue
        best = id2score.get(repo_id)
        if best is None:
            # First sighting: admit the repo only if this record scored > 0.
            if score > 0:
                repo_ids.append(repo_id)
                id2score[repo_id] = score
        elif best < score:
            # Already admitted: just raise the recorded best score. The repo
            # id is not appended again, so repo_ids stays duplicate-free.
            id2score[repo_id] = score
def get_module_from_file(filepath: str, symbol: str):
    """Return the source text of the definition named ``symbol`` in a file.

    The file is parsed with ``ast`` and the first function, async function or
    class (in ``ast.walk`` order) whose name equals ``symbol`` is selected.

    Args:
        filepath: Path of the Python source file to read (decoded as UTF-8).
        symbol: Bare name of the function or class to extract.

    Returns:
        The selected definition's source lines joined into one string, or
        ``None`` (after printing a message) when the file has a syntax error
        or contains no definition with that name.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()
    text = ''.join(lines)

    try:
        module_tree = ast.parse(text, filename=filepath)
    except SyntaxError as e:
        print(f"Syntax error in file {filepath}: {e}")
        return None

    definition_types = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    match = next(
        (
            candidate
            for candidate in ast.walk(module_tree)
            if isinstance(candidate, definition_types) and candidate.name == symbol
        ),
        None,
    )
    if match is None:
        print(f"Symbol '{symbol}' not found in file {filepath}.")
        return None

    first = match.lineno - 1  # lineno is 1-based, the list is 0-based
    last = getattr(match, 'end_lineno', None)
    if last is None:
        # No end position available on the node: return only the first line.
        return lines[first]
    return ''.join(lines[first:last])
def _parse_record_line(line):
    """Decode one line of the issues file without executing it.

    SECURITY NOTE: this replaces a bare ``eval()`` on file contents, which
    would run arbitrary code embedded in the data. Lines are parsed as JSON
    first; lines that are not valid JSON (e.g. written with ``str(dict)``)
    fall back to ``ast.literal_eval``, which only accepts Python literals.
    """
    try:
        return json.loads(line)
    except json.JSONDecodeError:
        return ast.literal_eval(line)


def generate_data(total_num_dataset, split, json_file):
    """Build the function-localisation RL dataset from an issues file.

    Records are taken from ``json_file`` (one record per line). A record
    becomes a dataset row when its position lies in the module-level
    inclusive window ``[start, end]``, its ``repo_id`` is listed in the
    module-level ``repo_ids``, and at least one of its ``code_sections`` has
    a non-empty ``file_path`` and ``func_name``.

    Args:
        total_num_dataset: Unused; kept so existing callers keep working.
        split: Stored verbatim in each row's ``extra_info["split"]``.
        json_file: Path of the line-delimited issues file. Each record must
            provide ``repo_id``, ``issue``, ``code_sections``, ``entry_file``
            and ``entry_function``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``prompt``, ``data_source``,
        ``ability``, ``reward_model``, ``extra_info`` and ``agent_name``.
    """
    rl_dataset = {
        "prompt": [],
        "data_source": [],
        "ability": [],
        "reward_model": [],
        "extra_info": [],
        "agent_name": [],
    }
    num_tool_calls = 1

    with open(json_file, 'r', encoding='utf-8') as f:
        # Blank lines are skipped; they used to crash the parser.
        json_dicts = [_parse_record_line(line.strip()) for line in f if line.strip()]

    # Build the lookup once: membership on the list is O(len(repo_ids)) per record.
    allowed_repos = set(repo_ids)

    count = 0
    for idx, json_dict in enumerate(json_dicts):
        # Both window bounds are inclusive.
        if idx < start or idx > end:
            continue
        repo_id = json_dict['repo_id']
        if repo_id not in allowed_repos:
            continue
        issue = json_dict['issue']

        # Ground truth is every "<file>::<function>" pair that is fully specified.
        ground_truths = []
        for subdict in json_dict['code_sections']:
            file_path = subdict['file_path']
            func_name = subdict['func_name']
            if not file_path or not func_name:
                continue
            ground_truths.append(f'{file_path}::{func_name}')
        if not ground_truths:
            continue
        count += 1
        ground_truth = ','.join(ground_truths)

        entry_file = json_dict['entry_file']
        entry_function = json_dict['entry_function']
        entry_path = os.path.join('./repos_smith_gold', repo_id, entry_file)
        # NOTE(review): get_module_from_file returns None on a syntax error or
        # a missing symbol, in which case the prompt below contains the text
        # "None" as the entry content. Confirm whether such rows should be kept.
        entry_content = get_module_from_file(entry_path, entry_function)

        # The prompt text is kept byte-for-byte; it ends with a separator
        # line followed by the repo id.
        prompt = (
            f"You are given a codebase and an issue, you need to locate the files and functions causing this issue."
            f"You can call the tool to check the definition code of a symbol. You can only check the symbol once for each turn, please start from the most possible symbol.\n"
            f"The 'file_path' is the relevant path of where the symbol is called, NOT where it is defined!\n"
            f"\n\nFor instance, if 'classA.functionB' is what you want to check (which is called in fileA.py), you should directly check 'functionB' in 'fileA.py'.\n\n"
            f"This is the issue:\n{issue}\n\n"
            f"The entry file of the code base is:\n{entry_file}:\n"
            f"{entry_content}\n\n"
            f"Your final answer should be all functions that should be modified, with the format of: relevant/path/to/file1.py::func_name1,relevant/path/to/file2.py::func_name2,...(a series of file::function seperated by comma)\n Please put your final answer inside \\boxed{{}} only in the last turn.\n You can only call the tool once each turn.\n"
            f"For instance:\n{{'name': 'check', 'arguments': {{'symbol': 'symbol_to_be_checked', 'file_path': 'file_where_the_symbol_is_used'}}}}"
            f"-+-+-+-+-+-+-+-+-+-+"
            f"{repo_id}"
        )
        prompt_with_template = [
            {
                "role": "user",
                "content": prompt,
            }
        ]

        rl_dataset["prompt"].append(prompt_with_template)
        rl_dataset["data_source"].append("codebase_hit")
        rl_dataset["ability"].append("codebase expert")
        rl_dataset["reward_model"].append({"style": "codebase_hit", "ground_truth": ground_truth})
        rl_dataset["extra_info"].append(
            {"index": idx, "issue": issue, "split": split, "expected_tool_calls": num_tool_calls, "repo_id": repo_id}
        )
        rl_dataset["agent_name"].append("codebase_path")
        print("Repo id:", repo_id)
        print(ground_truth)

    print("count:", count)
    rl_dataset = pd.DataFrame(data=rl_dataset)
    return rl_dataset
def clean_unicode_obj(obj):
    """Recursively make every string in ``obj`` encodable as UTF-8.

    Strings are encoded with ``surrogatepass`` and decoded with ``replace``,
    so code points that plain UTF-8 cannot encode (lone surrogates) are
    replaced instead of raising later. Lists and dicts (keys and values) are
    walked recursively; any other object is returned unchanged.
    """
    if isinstance(obj, str):
        return obj.encode("utf-8", "surrogatepass").decode("utf-8", "replace")
    if isinstance(obj, list):
        return [clean_unicode_obj(x) for x in obj]
    if isinstance(obj, dict):
        return {clean_unicode_obj(k): clean_unicode_obj(v) for k, v in obj.items()}
    return obj


if __name__ == "__main__":
    path = './issues_data_swe_smith.jsonl'
    train_dataset = generate_data(total_num_dataset=500, split="test", json_file=path)
    # Kept from the original script; note the parquet is NOT written here.
    os.makedirs('data/swe_smith_func', exist_ok=True)

    # Sanitise strings in object-typed columns before serialising.
    for col in train_dataset.columns:
        if train_dataset[col].dtype == "object":
            train_dataset[col] = train_dataset[col].map(clean_unicode_obj)

    output_path = "./recipe/langgraph_agent/example/smith/filtered_dense_{}_{}.parquet".format(start, end)
    # Create the directory the file is actually written to; previously only
    # the unrelated 'data/swe_smith_func' directory was created, so the write
    # failed when the target directory did not already exist.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    train_dataset.to_parquet(output_path)