import os 
import json
from tqdm import tqdm 
import argparse
import random 

def reference_data_statistics(path="livecodebench/code_generation_lite/reference_datasets.jsonl"):
    """Print whitespace-token statistics for the questions in a JSONL dataset.

    Each non-blank line of ``path`` is parsed as one JSON record. The text in
    the record's ``"question_content"`` field is split on whitespace, and the
    record count plus the max / min / average token counts are printed.

    Args:
        path: JSONL file to analyse. Defaults to the reference dataset
            location that this function originally had hard-coded.

    Raises:
        KeyError: If a record has no ``"question_content"`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at EOF) are not records.
        datasets = [json.loads(line) for line in f if line.strip()]

    print(f"length of datasets = {len(datasets)}")

    if not datasets:
        # max()/min() raise on an empty sequence and the average would divide
        # by zero, so there is nothing meaningful left to report.
        print("no records found; token statistics skipped")
        return

    # "Token" here means a whitespace-separated chunk, not a model token.
    content_tokens = [len(item["question_content"].split()) for item in datasets]

    print(f"max token = {max(content_tokens)}")
    print(f"min token = {min(content_tokens)}")
    print(f"avg token = {sum(content_tokens)/len(content_tokens)}")

def _count_jsonl_records(file_path):
    """Return the number of non-blank lines in ``file_path``, validating each as JSON."""
    count = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            # Parse (and discard) so malformed lines still fail loudly, without
            # keeping the whole dataset in memory just to take its length.
            json.loads(line)
            count += 1
    return count


def synthetic_data_count_statistics(synthetic_dir, synthetic_instruction_file):
    """Print how many samples remain after each stage of the filtering pipeline.

    Three JSONL files inside ``synthetic_dir`` are counted:

    * ``synthetic_instruction_file`` itself (the raw synthetic instructions);
    * the same name with ``.jsonl`` replaced by
      ``_instruct_quality_response.jsonl``;
    * the same name with ``.jsonl`` replaced by
      ``_instruct_quality_response_quality_filtered.jsonl``.

    For the two filtered files the ratio to the raw count is printed as well,
    rounded to 4 decimals. When the raw file holds no records the ratio is
    reported as 0.0 instead of raising ``ZeroDivisionError``.

    Args:
        synthetic_dir: Directory that contains all three files.
        synthetic_instruction_file: File name of the raw instruction JSONL,
            e.g. ``'vllm_generated_80000_samples_temp1.0.jsonl'``.

    Raises:
        FileNotFoundError: If any of the three files is missing.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    synthetic_instruction_path = os.path.join(synthetic_dir, synthetic_instruction_file)
    synthetic_instruction_filtered_path = os.path.join(
        synthetic_dir,
        synthetic_instruction_file.replace('.jsonl', '_instruct_quality_response.jsonl'),
    )
    instruction_response_filtered_path = os.path.join(
        synthetic_dir,
        synthetic_instruction_file.replace('.jsonl', '_instruct_quality_response_quality_filtered.jsonl'),
    )

    raw_count = _count_jsonl_records(synthetic_instruction_path)
    instruction_filtered_count = _count_jsonl_records(synthetic_instruction_filtered_path)
    response_filtered_count = _count_jsonl_records(instruction_response_filtered_path)

    def _ratio(count):
        # Guard against an empty raw file; there is no meaningful ratio then.
        return round(count / raw_count, 4) if raw_count else 0.0

    print(f"Synthetic instruction count = {raw_count}")
    print(f"Synthetic instruction filtered count = {instruction_filtered_count} || ration = {_ratio(instruction_filtered_count)}")
    print(f"instruction_response_dataset count = {response_filtered_count} || ration = {_ratio(response_filtered_count)}")

def train_val_datasets(
    path="livecodebench/code_generation_lite/reference_datasets.jsonl",
    train_path="livecodebench/code_generation_lite/reference_train.jsonl",
    valid_path="livecodebench/code_generation_lite/reference_valid.jsonl",
    train_ratio=0.9,
    seed=42,
):
    """Shuffle a JSONL dataset deterministically and split it into train/valid files.

    The records are shuffled with a private random generator seeded with
    ``seed``. The first ``int(len(records) * train_ratio)`` records are written
    to ``train_path`` and the remainder to ``valid_path``, one JSON object per
    line, UTF-8 encoded with non-ASCII characters kept as-is.

    With the default arguments the files and the resulting split are the same
    as when the paths, the 0.9 ratio and the seed 42 were hard-coded.

    Args:
        path: Source JSONL file.
        train_path: Destination for the training split.
        valid_path: Destination for the validation split.
        train_ratio: Fraction of records (0..1) that goes to the training split.
        seed: Seed for the shuffle.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

    # Read as UTF-8 to match the encoding used for the output files below;
    # the platform default may differ and mis-decode non-ASCII text.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at EOF) are not records.
        reference_datasets = [json.loads(line) for line in f if line.strip()]

    # A dedicated Random instance yields the same permutation as seeding the
    # module-level generator, without reseeding it for the rest of the process.
    random.Random(seed).shuffle(reference_datasets)

    split_index = int(len(reference_datasets) * train_ratio)
    splits = (
        (train_path, reference_datasets[:split_index]),
        (valid_path, reference_datasets[split_index:]),
    )

    for out_path, items in splits:
        with open(out_path, 'w', encoding='utf-8') as f:
            for item in items:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
    
def magpie_for_filter(path, output_path=None):
    """Convert a Magpie instruction/response JSON file into renamed-key JSONL.

    ``path`` must contain a JSON array of objects with ``"instruction"`` and
    ``"response"`` keys. Each object becomes one output line of the form
    ``{"synthetic_text": <instruction>, "original_response": <response>}``;
    any other keys are dropped. The number of input records is printed.

    Args:
        path: Input ``.json`` file.
        output_path: Where to write the JSONL result. When omitted it is
            derived from ``path`` by replacing the extension with
            ``_format.jsonl`` (``..._ins_res.json`` ->
            ``..._ins_res_format.jsonl``), so converting a different input no
            longer overwrites the output of an unrelated run.

    Raises:
        KeyError: If a record lacks ``"instruction"`` or ``"response"``.
        json.JSONDecodeError: If ``path`` is not valid JSON.
    """
    if output_path is None:
        output_path = os.path.splitext(path)[0] + "_format.jsonl"

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"len = {len(data)}")

    # Build every record before opening the output so that a malformed input
    # record cannot leave a truncated output file behind.
    new_datasets = [
        {
            'synthetic_text': item['instruction'],
            'original_response': item['response'],
        }
        for item in data
    ]

    with open(output_path, 'w', encoding='utf-8') as g:
        for item in new_datasets:
            g.write(json.dumps(item) + '\n')
  

if __name__ == "__main__":
    # Script entry point. Exactly one pipeline step is active at a time; the
    # others are kept below as commented-out calls and enabled by hand.

    # These options feed synthetic_data_count_statistics(); they are parsed on
    # every run even while that step is disabled.
    cli = argparse.ArgumentParser()
    for option in ("--synthetic_dir", "--synthetic_instruction_file"):
        cli.add_argument(option, type=str, default='xxx')
    cli_args = cli.parse_args()

    # Disabled steps:
    # reference_data_statistics()
    # synthetic_data_count_statistics(
    #     synthetic_dir=cli_args.synthetic_dir,
    #     synthetic_instruction_file=cli_args.synthetic_instruction_file)
    # train_val_datasets()

    # Active step: convert the Magpie instruction/response dump.
    magpie_input = "magpie/code-gen/code_gen_domain_Qwen2.5-Coder-7B-Instruct_topp1_temp1_202504121808/Magpie_Qwen2.5-Coder-7B-Instruct_80000_202504121808_ins_res.json"
    magpie_for_filter(magpie_input)