import json
from collections import defaultdict
from pathlib import Path


def _iter_records(file_path):
    """Yield every JSON object found in a JSONL file.

    Blank lines, lines that are not valid JSON, and lines whose top-level
    value is not a JSON object are skipped rather than raising.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(record, dict):
                yield record


def _normalized_pair(pair):
    """Return the edge key for a raw pair, or None when the pair is unusable.

    A usable pair is a two-element list/tuple of strings that are non-empty
    and different from each other after lower-casing and stripping. The key
    is the two normalized words in sorted order, so (a, b) and (b, a) map to
    the same undirected edge.
    """
    if not isinstance(pair, (list, tuple)) or len(pair) != 2:
        return None
    if not all(isinstance(word, str) for word in pair):
        return None
    word1, word2 = (word.lower().strip() for word in pair)
    if not word1 or not word2 or word1 == word2:
        return None
    return tuple(sorted((word1, word2)))


def build_word_graph(folder_path, output_file):
    """Build an undirected word graph from the ``*.jsonl`` files in a folder.

    Each JSONL record is expected to look like
    ``{"text_en": <str>, "extraction": {"pairs": [[w1, w2], ...]}}``.
    Every valid pair contributes one unit of weight to the edge between its
    two (lower-cased, stripped) words, and the record's ``text_en`` is kept
    once per edge as an example sentence. Malformed records and pairs are
    skipped instead of aborting the run.

    Args:
        folder_path: Directory that is searched (non-recursively) for
            ``*.jsonl`` files. Files are processed in sorted path order so
            the sentence order is reproducible.
        output_file: Path of the JSON file to write. Missing parent
            directories are created.

    Returns:
        A dict mapping integer node ids (the index of the word in the
        alphabetically sorted vocabulary) to
        ``{"word": str, "adjacency": {neighbor_word: {"id", "weight",
        "sentences"}}}``. In the written JSON file the integer ids appear as
        string keys, because JSON object keys are always strings.
    """
    # "sentences" is a dict used as an insertion-ordered set: O(1) duplicate
    # detection while keeping first-seen order.
    edge_info = defaultdict(lambda: {"count": 0, "sentences": {}})

    for file_path in sorted(Path(folder_path).glob("*.jsonl")):
        for data in _iter_records(file_path):
            extraction = data.get('extraction')
            if not isinstance(extraction, dict):
                continue
            pairs = extraction.get('pairs')
            if not isinstance(pairs, (list, tuple)):
                continue

            text = data.get('text_en', '')
            if not isinstance(text, str):
                text = ''

            for pair in pairs:
                edge_key = _normalized_pair(pair)
                if edge_key is None:
                    continue
                info = edge_info[edge_key]
                info["count"] += 1
                if text:
                    info["sentences"].setdefault(text, None)

    # Node ids follow the alphabetical order of the vocabulary.
    word_list = sorted({word for edge in edge_info for word in edge})
    word_to_id = {word: idx for idx, word in enumerate(word_list)}

    result = {
        word_to_id[word]: {"word": word, "adjacency": {}}
        for word in word_list
    }

    for (word1, word2), info in edge_info.items():
        id1 = word_to_id[word1]
        id2 = word_to_id[word2]
        sentences = list(info["sentences"])

        result[id1]["adjacency"][word2] = {
            "id": id2,
            "weight": info["count"],
            "sentences": sentences,
        }
        # Give the reverse direction its own list so that mutating one side
        # of the returned graph does not silently change the other.
        result[id2]["adjacency"][word1] = {
            "id": id1,
            "weight": info["count"],
            "sentences": list(sentences),
        }

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    return result


if __name__ == "__main__":
    # Hard-coded locations of the input JSONL folder and the output graph.
    input_folder = "/XXXX/utlis/metaphor_pairs_translate"
    output_file = "/XXXX/utlis/metaphor_graph.json"

    graph = build_word_graph(input_folder, output_file)

    # Print a short preview: the first two nodes and, for each of them,
    # the first two neighbours with the first example sentence (if any).
    for node_id, node in list(graph.items())[:2]:
        print(f"\n{node_id}:")
        print(f"  word: {node['word']}")
        print("  adjacency:")
        preview = list(node["adjacency"].items())[:2]
        for neighbor, info in preview:
            print(f"    {neighbor}:")
            print(f"      id: {info['id']}")
            print(f"      weight: {info['weight']}")
            if info['sentences']:
                print(f"      sentences: {info['sentences'][0]}")
            else:
                print("      sentences: []")