
import time
import json
import glob
import os



def merge_jsonl_files(input_pattern, output_file):
    """
    Merge several JSON Lines files into a single file.

    Each non-blank line of every file matching ``input_pattern`` is parsed
    with ``json.loads``. Lines that parse are written to ``output_file``,
    one record per line; lines that do not parse are reported and skipped.
    A file that cannot be read or decoded is reported and the merge
    continues with the next file.

    Args:
        input_pattern: glob pattern selecting the input files,
            e.g. "data/*.jsonl".
        output_file: path of the merged file; it is overwritten if it
            already exists.
    """
    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # glob returns matches in arbitrary order: sort for a reproducible merge.
    # The output file is excluded because it may itself match the pattern
    # (e.g. left over from a previous run) and is truncated below.
    output_abs = os.path.abspath(output_file)
    input_files = sorted(
        path for path in glob.glob(input_pattern)
        if os.path.abspath(path) != output_abs
    )
    print(f"Found {len(input_files)} files to merge")

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for input_file in input_files:
            print(f"Processing {input_file}")
            try:
                with open(input_file, 'r', encoding='utf-8') as infile:
                    for line in infile:
                        record = line.strip()
                        if not record:
                            # Blank lines carry no record; nothing to report.
                            continue
                        try:
                            json.loads(record)
                        except json.JSONDecodeError:
                            print(f"Skipping invalid JSON line in {input_file}")
                            continue
                        # Write exactly what was validated, followed by one
                        # newline, so a final line lacking '\n' cannot fuse
                        # with the first record of the next file.
                        outfile.write(record + '\n')
            except (OSError, UnicodeError) as e:
                # Best effort: report the unreadable file and keep merging.
                print(f"Error processing {input_file}: {str(e)}")



def read_json(file_path):
    """Parse the UTF-8 JSON document stored at ``file_path`` and return it."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
    
    
    
def write_json(data, file_path):
    """Serialize ``data`` as JSON into ``file_path``, replacing any existing file.

    The file is written as UTF-8 and non-ASCII characters are emitted as-is
    rather than as ``\\uXXXX`` escapes.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, ensure_ascii=False)

def break_into_classes(data, output_dir='./analysis/dataset/MSR_classes'):
    """Group records by their 'CWE ID' value and write one JSON file per group.

    Args:
        data: iterable of mappings, each of which must have a 'CWE ID' key.
        output_dir: directory that receives the per-class files; created if
            missing. Defaults to the location the script has always used.

    Returns:
        A dict mapping each CWE ID to the list of records carrying it, in
        the order the records appeared in ``data``.

    Raises:
        KeyError: if a record has no 'CWE ID' key.
    """
    cwe_classes = {}
    for item in data:
        cwe_classes.setdefault(item['CWE ID'], []).append(item)

    if cwe_classes:
        # open(..., 'w') does not create parent directories, so make sure
        # the target exists before writing the first file.
        os.makedirs(output_dir, exist_ok=True)

    for cwe_id, cwe_data in cwe_classes.items():
        # NOTE(review): cwe_id is interpolated verbatim into the file name;
        # confirm the dataset never holds IDs containing path separators.
        file_path = os.path.join(
            output_dir, f'MSR_data_cleaned_vul_classes_{cwe_id}.json'
        )
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(cwe_data, f, ensure_ascii=False)
    return cwe_classes


if __name__ == '__main__':
    # Example usage: load the dataset, report how long loading took, then
    # split it into one JSON file per CWE ID.

    # perf_counter is the clock intended for measuring elapsed intervals.
    time_start = time.perf_counter()
    file_path = './analysis/dataset/MSR_data_cleaned_vul.json'
    data = read_json(file_path)
    time_end = time.perf_counter()
    print(f'Time taken: {time_end - time_start} seconds')
    # Record keys, as noted by the original author:
    # dict_keys(['', 'Access Gained', 'Attack Origin', 'Authentication Required', 
    # 'Availability', 'CVE ID', 'CVE Page', 'CWE ID', 'Complexity', 'Confidentiality', 
    # 'Integrity', 'Known Exploits', 'Publish Date', 'Score', 'Summary', 
    # 'Update Date', 'Vulnerability Classification', 'add_lines', 'codeLink',
    # 'commit_id', 'commit_message', 'del_lines', 'file_name', 'files_changed', 
    # 'func_after', 'func_before', 'lang', 'lines_after', 'lines_before',
    # 'parentID', 'patch', 'project', 'project_after', 'project_before', 'vul', 'vul_func_with_fix'])

    data_dict = break_into_classes(data)
    # The leftover pdb breakpoint was removed so the script can run
    # unattended; print a short summary instead.
    print(f'Wrote {len(data_dict)} CWE class files')

    
    
    


