import os
import json
import re
from enum import Enum, auto
from typing import Optional, List, Dict, Any
from collections import defaultdict
from tree_sitter_language_pack import Language, get_parser, get_language
import argparse
import subprocess
import uuid

import pandas as pd
from tqdm import tqdm

from .location import CodeLocation, ComponentType
from .language_specify import LanguageManager, LANGUAGE_TO_EXTENSIONS

def merge_intervals(intervals):
    """Merge overlapping intervals into a sorted list of disjoint intervals.

    Intervals are ``(start, end)`` pairs with inclusive bounds. Two intervals
    are merged only when they overlap (``current.start <= previous.end``);
    intervals that are merely adjacent, such as ``(1, 2)`` and ``(3, 4)``,
    stay separate.

    The input list is left untouched: sorting is done on a copy so callers
    do not see their list reordered as a side effect.

    Args:
        intervals: A list of ``(start, end)`` pairs, in any order.

    Returns:
        A new list of intervals ordered by start. Intervals produced by a
        merge are tuples; intervals that needed no merging are returned as
        the original objects.
    """
    if not intervals:
        return []

    # sorted() (stable, like list.sort) works on a copy instead of mutating
    # the caller's list.
    ordered = sorted(intervals, key=lambda interval: interval[0])

    merged_intervals = [ordered[0]]
    for current in ordered[1:]:
        last = merged_intervals[-1]
        if current[0] <= last[1]:
            # Overlap: extend the previous interval to cover both.
            merged_intervals[-1] = (last[0], max(last[1], current[1]))
        else:
            merged_intervals.append(current)

    return merged_intervals

class CodeStructure:
    """
    Analyze a project directory, build a code-structure tree, and provide CodeLocation-based lookups.
    """
    NODE_TYPE_KEY = "/type/"
    NODE_TYPE_DIR = "directory"
    NODE_TYPE_FILE = "file"
    LANGUAGE_TO_EXTENSIONS = {lang.value: extensions for lang, extensions in LANGUAGE_TO_EXTENSIONS.items()}

    def __init__(self, directory_path: str, language: str):
        """
        Create the analyzer and immediately parse ``directory_path``.

        The tree-sitter parser and language objects are created once here and
        kept on the instance so every file is parsed with the same objects.

        Args:
            directory_path (str): Root directory to analyze; stored as an absolute path.
            language (str): Language identifier; must be a key of ``LANGUAGE_TO_EXTENSIONS``.

        Raises:
            ValueError: If ``language`` is not a supported language.
        """
        supported_languages = self.LANGUAGE_TO_EXTENSIONS
        if language not in supported_languages:
            raise ValueError(f"不支持的语言: '{language}'")

        self.directory_path = os.path.abspath(directory_path)
        self.language_manager = LanguageManager(language)

        # Parser first, then language object, both looked up by language name.
        self.parser = get_parser(language)
        self.lang = get_language(language)

        # Parsing happens eagerly; the resulting nested dict is the structure tree.
        self.tree: Dict[str, Any] = self._parse_directory()

    def save(self, filepath: str, **kwargs):
        """
        将完整的codestructure（包括元data）Save到JSONfiles.

        Args:
            filepath (str): 要保存到的文件path.
        """
        data_to_save = {
            "metadata": {
                "directory_path": self.directory_path,
                "language": self.language_manager.language.value,
                **kwargs
            },
            "structure_tree": self.tree
        }
        
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data_to_save, f, indent=4, ensure_ascii=False)

    @classmethod
    def load(cls, filepath: str) -> 'CodeStructure':
        """
        fromJSONfilesLoadcodestructure, 重建一个CodeStructureinstance.

        Args:
            filepath (str): 要从中加载的文件path.

        Returns:
            CodeStructure: 一个与Save时status相同的新实例.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"结构文件不存在: {filepath}")

        with open(filepath, 'r', encoding='utf-8') as f:
            saved_data = json.load(f)
            
        metadata = saved_data.get("metadata")
        structure_tree = saved_data.get("structure_tree")
        
        if not metadata or not structure_tree:
            raise ValueError("JSON文件格式无效或不完整.缺少 'metadata' 或 'structure_tree'.")

        # create一个emptyinstance（useLoadmode以skippedParse）
        instance = cls(
            directory_path=metadata.get("directory_path"), 
            language=metadata.get("language"), 
        )
        
#
        instance.directory_path = metadata.get("directory_path")
        instance.language_manager = LanguageManager(metadata.get("language"))
        instance.tree = structure_tree
        
        return instance
    

    def filter_test_files(self):
        """
        Filters out all test files and directories from the structure.
        This removes any file or directory whose name starts with 'test'.
        It then iteratively removes any directories that become empty as a result.
        """
        def should_delete(name: str, content: Dict[str, Any]) -> bool:
            return self.language_manager.is_test_path(name)
        
        self._filter_and_prune(self.tree, should_delete)

    def filter_none_code(self):
        """
        Filters out files that could not be parsed correctly by tree-sitter.
        This is interpreted as removing any file node that contains an 'error' key,
        which indicates a parsing failure. It then iteratively removes any
        directories that become empty as a result.
        """
        def should_delete(name: str, content: Dict[str, Any]) -> bool:
            is_file = isinstance(content, dict) and content.get(self.NODE_TYPE_KEY) == self.NODE_TYPE_FILE
            has_error = 'error' in content if isinstance(content, dict) else False
            return is_file and has_error

        self._filter_and_prune(self.tree, should_delete)

    def _filter_and_prune(self, current_node: Dict[str, Any], should_delete_item) -> bool:
        """
        Recursively filters the structure tree based on a condition and prunes empty directories.

        Args:
            current_node: The current node (directory) in the tree to process.
            should_delete_item: A function that takes (name, content) and returns True if the item should be deleted.

        Returns:
            True if the current_node itself has become empty and should be pruned by its caller.
        """
        keys_to_delete = []
        # First, check children and recurse into subdirectories
        for name, content in current_node.items():
            if name == self.NODE_TYPE_KEY:
                continue

            # Condition 1: The item itself matches the deletion criteria
            if should_delete_item(name, content):
                keys_to_delete.append(name)
                continue

            # Condition 2: Recurse into subdirectories
            if isinstance(content, dict) and content.get(self.NODE_TYPE_KEY) == self.NODE_TYPE_DIR:
                # If the subdirectory becomes empty after filtering, mark it for deletion
                if self._filter_and_prune(content, should_delete_item):
                    keys_to_delete.append(name)
        
        # Delete the marked items from the current node
        for key in keys_to_delete:
            if key in current_node:
                del current_node[key]

        # Finally, determine if the current node is now empty (only contains the type key)
        # and should be pruned by its parent.
        return len(current_node) <= 1
    

    def _parse_directory(self) -> Dict[str, Any]:
        """
        [重构后] 遍历directory, 并Build一个反映files系统层级的嵌套dict.
        """
        root_structure = {}
        supported_extensions = self.language_manager.code_suffix_set()

        for root, _, files in os.walk(self.directory_path, topdown=True):
            for file in files:
                if not file.endswith(supported_extensions):
                    continue
                
                path = os.path.join(root, file)
                relative_path = os.path.relpath(path, self.directory_path).replace(os.path.sep, '/')
                
                path_parts = relative_path.split('/')
                current_level = root_structure
                
                # 遍历path的directory部分
                for part in path_parts[:-1]:
                    # 如果directorynode不exists, 则create它
                    if part not in current_level:
                        current_level[part] = {self.NODE_TYPE_KEY: self.NODE_TYPE_DIR}
#
                    current_level = current_level[part]
                
                # 在correct的positionsettingsfilesnode
                file_name = path_parts[-1]
                # TODO: try
                # try:
                file_content = self._parse_file(path)
                current_level[file_name] = {
                    self.NODE_TYPE_KEY: self.NODE_TYPE_FILE,
                    **file_content # 将filesAnalyzeresultMerge进来
                }
                # except Exception as e:
                #     print(f"ParsefilesFailure '{relative_path}': {e}")
                #     current_level[file_name] = {
                #         self.NODE_TYPE_KEY: self.NODE_TYPE_FILE,
                #         "error": str(e)
                #     }
                        
        return root_structure
    

    def _get_globals(self, all_lines: List[str], defined_blocks: List[Dict]) -> List[Dict]:
        """识别并Extract全局code块（在function/class定义between的代码）."""
        globals_list = []
        last_block_end = 0
        
        # grouped by起始rowSort
        sorted_blocks = sorted(defined_blocks, key=lambda x: x['start_line'])
        
        for block in sorted_blocks:
            # Checkcurrent块与previous块between的间隙
            gap_start = last_block_end + 1
            gap_end = block['start_line'] - 1
            if gap_start <= gap_end:
                globals_list.append({
                    "name": f"global_block_{gap_start}-{gap_end}",
                    "start_line": gap_start,
                    "end_line": gap_end,
                })
            last_block_end = block['end_line']
            
        # Checklast块到files末尾的间隙
        gap_start = last_block_end + 1
        gap_end = len(all_lines)
        if gap_start <= gap_end:
             globals_list.append({
                "name": f"global_top_level_{gap_start}-{gap_end}",
                "start_line": gap_start,
                "end_line": gap_end,
            })
             
        return globals_list

    def _get_node_metadata(self, outer_node, name_source_node, text_bytes: bytes) -> Dict[str, Any]:
        """
        from外部nodeGet范围, 从内部节点获取name.
        """
        start_line = outer_node.start_point[0] + 1
        end_line = outer_node.end_point[0] + 1
        
        # 首先尝试standard的namefield
        name_node = name_source_node.child_by_field_name('name')
        if name_node:
            name = name_node.text.decode('utf8')
        else:
            # 对于C/C++语言的特殊Process
            name = self._extract_name_for_language(name_source_node)
        
        return {"name": name, "start_line": start_line, "end_line": end_line}
    
    def _extract_name_for_language(self, node) -> str:
        """
        根据不同语言andnodeclass型Extractname的特殊Processmethod.
        """
        # C/C++语言的特殊Process
        if self.language_manager.language.value in ['c', 'cpp']:
            if node.type == 'function_definition':
                # function定义:name在declaratorfield的function_declarator中的firstidentifier
                declarator_node = node.child_by_field_name('declarator')
                if declarator_node and declarator_node.type == 'function_declarator':
                    for child in declarator_node.children:
                        if child.type == 'identifier':
                            return child.text.decode('utf8')
            elif node.type in ['struct_specifier', 'union_specifier', 'enum_specifier', 'class_specifier']:
                # structure体/联合体/枚举/class:Findtype_identifier子node
                for child in node.children:
                    if child.type == 'type_identifier':
                        return child.text.decode('utf8')
        
        # 其他语言或未Match的情况, 尝试Findfirstidentifier子node
        for child in node.children:
            if child.type == 'identifier':
                return child.text.decode('utf8')
        
        return 'anonymous'

    # --- Refactored _parse_file method and its helpers ---

    def _find_top_level_ancestor(self, node, anchor_node, forbidden_types):
        """
        [修正后] from`node`Starting向上追溯, 直到found`anchor_node`的直接子node.
        use一个动态的 forbidden_types set来判断edge界.
        """
        current = node
        # path中不能containing的nodeclass型 (现在是动态传入的)
        
        while current.parent:
            if current.parent == anchor_node:
                return current  # found了, current是顶层node
            
# , ,
            if current.parent.type in forbidden_types:
                return None
            
            current = current.parent
        
        return None # 未found锚点

    def _is_direct_child_node(self, node, anchor_node, forbidden_types):
        """
        from`node`Starting向上追溯, 直到found`anchor_node`
        判断node是否为anchor_node的子node, 而且不经过forbidden_types
        """
        current = node
        # path中不能containing的nodeclass型 (现在是动态传入的)
        
        while current.parent:
            if current.parent == anchor_node:
                return True
            
# , ,
            if current.parent.type in forbidden_types:
                return False
            
            current = current.parent
        
        return False

    def _parse_file(self, path: str) -> Dict[str, Any]:
        with open(path, 'rb') as f:
            content_bytes = f.read()
        
        tree = self.parser.parse(content_bytes)
        root_node = tree.root_node
        all_lines = content_bytes.decode('utf-8', errors='ignore').splitlines()
        
        queries = self.language_manager.get_queries()
        boundary_nodes = self.language_manager.get_boundary_nodes() # <--- Getedge界node
        
        toplevel_query_str = queries.get("toplevel_definitions")
        
        if not toplevel_query_str:
            return {"functions": [], "classes": [], "globals": [], "text_lines": all_lines}

        query = self.lang.query(toplevel_query_str)
        all_captures = query.captures(root_node)
        
        functions, classes = [], []
        processed_nodes = set()

        all_class_definitions = all_captures.get('cls', [])
        all_func_definitions = all_captures.get('func', [])
        all_method_definitions = all_captures.get('meth', [])

        # Processclass及其method
        for class_def_node in all_class_definitions:
            if class_def_node in processed_nodes:
                continue

            if self._is_direct_child_node(class_def_node, root_node, boundary_nodes):
                metadata = self._get_node_metadata(class_def_node, class_def_node, content_bytes)
                processed_nodes.add(class_def_node)
                methods = []
                method_query_str = queries.get("class_methods")
                if method_query_str:
                    search_node_for_methods = class_def_node
                    method_query = self.lang.query(method_query_str)
                    method_captures = method_query.captures(search_node_for_methods)
                    
                    for method_def_node in method_captures.get('meth', []):
                        if self._is_direct_child_node(method_def_node, search_node_for_methods, boundary_nodes):
                            method_metadata = self._get_node_metadata(method_def_node, method_def_node, content_bytes)
                            # if method_metadata['name'] != 'anonymous':
                            methods.append(method_metadata)
                            processed_nodes.add(method_def_node)
                
                metadata['methods'] = sorted(methods, key=lambda m: m['start_line'])
                # if metadata['name'] != 'anonymous':
                classes.append(metadata)

        all_func_definitions.extend(all_method_definitions)

        # Process顶层function
        for func_def_node in all_func_definitions:
            if func_def_node in processed_nodes:
                continue
            
            # --- 关键改动:传入 boundary_nodes ---
            if self._is_direct_child_node(func_def_node, root_node, boundary_nodes):
                metadata = self._get_node_metadata(func_def_node, func_def_node, content_bytes)
                # if method_metadata['name'] != 'anonymous':
                functions.append(metadata)
                processed_nodes.add(func_def_node)

        all_defined_blocks = functions + classes
        globals_list = self._get_globals(all_lines, all_defined_blocks)

        return {
            "functions": sorted(functions, key=lambda f: f['start_line']),
            "classes": sorted(classes, key=lambda c: c['start_line']),
            "globals": globals_list,
            "text_lines": all_lines,
        }
    

    def find(self, location: 'CodeLocation') -> Optional[Dict[str, Any]]:
        """
        Look up the code component described by ``location`` in the structure tree.

        The file node is located by following ``location.file_path`` (split on
        ``'/'``) through the nested dict; the component is then searched
        according to ``location.component_type``:

        * GLOBAL: returns ``{"name": "global", "blocks": ..., "text": ...}``,
          where ``text`` joins every non-blank global block, each followed by
          a ``"..."`` separator line.
        * FUNCTION / CLASS / METHOD: returns a copy of the matching record with
          an added ``text`` entry holding its source lines.

        Args:
            location: Description of the component to find.

        Returns:
            The record described above, or ``None`` when the path does not
            lead to a file node or the component is not found.
        """
        # 1. Walk the nested dict down to the file node.
        file_node = self.tree
        for part in location.file_path.split('/'):
            if not isinstance(file_node, dict) or part not in file_node:
                return None
            file_node = file_node[part]

        if not isinstance(file_node, dict) or file_node.get(self.NODE_TYPE_KEY) != self.NODE_TYPE_FILE:
            return None

        kind = location.component_type

        # 2a. Globals are a list of blocks and get their own result shape.
        if kind == ComponentType.GLOBAL:
            blocks = file_node["globals"]
            collected_lines = []
            for block in blocks:
                first = block['start_line'] - 1
                last = block['end_line']
                block_lines = file_node.get('text_lines', [])[first:last]
                # Blocks consisting only of whitespace are left out of the text.
                if ''.join(block_lines).strip():
                    collected_lines.extend(block_lines)
                    collected_lines.append("...")
            return {
                "name": "global",
                "blocks": blocks,
                "text": "\n".join(collected_lines),
            }

        # 2b. Named components: take the first record whose name matches.
        target_node = None
        if kind == ComponentType.FUNCTION:
            target_node = next(
                (fn for fn in file_node.get('functions', []) if fn['name'] == location.member_name),
                None,
            )
        elif kind == ComponentType.CLASS:
            target_node = next(
                (klass for klass in file_node.get('classes', []) if klass['name'] == location.class_name),
                None,
            )
        elif kind == ComponentType.METHOD:
            target_node = next(
                (
                    method
                    for klass in file_node.get('classes', [])
                    if klass['name'] == location.class_name
                    for method in klass.get('methods', [])
                    if method['name'] == location.member_name
                ),
                None,
            )

        if not target_node:
            return None

        # 3. Attach the component's source text (line numbers are 1-based, inclusive).
        first = target_node['start_line'] - 1
        last = target_node['end_line']
        result = target_node.copy()
        result['text'] = '\n'.join(file_node.get('text_lines', [])[first:last])
        return result

    def get_imports_of_file(self, path: str) -> Optional[List[str]]:
        """
        Retrieves a list of all import statements for a given file by using regex
        to find single and multi-line imports.

        Args:
            path: The relative path to the file (e.g., 'gunicorn/http/wsgi.py').

        Returns:
            A list of strings, where each string is a complete import statement.
            Returns None if the file is not found in the structure.
        """
        path_parts = path.replace(os.path.sep, '/').split('/')
        current_level = self.tree

        # 1. Navigate to the file node
        for part in path_parts:
            if not isinstance(current_level, dict) or part not in current_level:
                return None  # File not found
            current_level = current_level.get(part)

        file_node = current_level

        if not isinstance(file_node, dict) or file_node.get(self.NODE_TYPE_KEY) != self.NODE_TYPE_FILE:
            return None  # Path does not point to a valid file node

        # 2. Get all text lines from the node and join them
        all_lines = file_node.get('text_lines', [])
        if not all_lines:
            return []
        
        full_text = "\n".join(all_lines)

        # 3. Use regex to find all import statements (single and multi-line)
        # This pattern first looks for multi-line 'from ... import (...)', then any single-line 'from/import'.
        # The re.MULTILINE flag allows `^` to match at the beginning of each line.
        # pattern = re.compile(
        #     r"^from\s+[\w\.]+\s+import\s+\([\s\S]*?\)|^(?:from|import)\s+.*",
        #     re.MULTILINE
        # )

        pattern = self.language_manager.get_imports_pattern()
        
        if pattern is None:
            return []
        
        imports = pattern.findall(full_text)
        imports = [line.replace('\n', '') for line in imports]

        return imports
    

    def get_file_node(self, path):
        path_parts = path.replace(os.path.sep, '/').split('/')
        current_level = self.tree

        for part in path_parts:
            if not isinstance(current_level, dict) or part not in current_level:
                return None
            current_level = current_level.get(part)

        return current_level

    def get_class_names(self, path) -> List[str]:
        file_node = self.get_file_node(path)
        return [claz['name'] for claz in file_node["classes"]]

    def build_file_summary(self, path: str) -> str:
        """
        Summarizes a script, printing class names, static functions, and class methods
        in a structured format.

        Args:
            path (str): The relative path to the file (e.g., 'gunicorn/http/wsgi.py').

        Returns:
            str: A formatted string summarizing the file's structure.
        """
        path_parts = path.replace(os.path.sep, '/').split('/')
        current_level = self.tree

        # 1. Traverse the nested dictionary to locate the file node
        for part in path_parts:
            if not isinstance(current_level, dict) or part not in current_level:
                return f"Error: File not found in structure tree: {path}"
            current_level = current_level.get(part)

        file_node = current_level

        # Check if we have a valid file node
        if not isinstance(file_node, dict) or file_node.get(self.NODE_TYPE_KEY) != self.NODE_TYPE_FILE:
            return f"Error: Path does not point to a file node: {path}"
        
        if "error" in file_node:
            return f"Error: File '{path}' failed to parse: {file_node['error']}"

        # 2. Extract information
        classes = file_node.get('classes', [])
        static_functions = file_node.get('functions', [])

        class_names = [c['name'] for c in classes]
        static_function_names = [f['name'] for f in static_functions]
        
        class_functions_summary = []
        for cls in classes:
            class_name = cls.get('name', 'anonymous')
            methods = cls.get('methods', [])
            method_names = [m.get('name', 'anonymous') for m in methods]
            class_functions_summary.append(f"{class_name}: {method_names}")

        # 3. Format the output string
        output_lines = [
            f"file: {path}",
            f"\tclass: {class_names}",
            f"\tstatic functions:  {static_function_names}",
            f"\tclass functions: ["
        ]
        # Add each class method summary line, indented
        for summary_line in class_functions_summary:
            output_lines.append(f"\t\t{summary_line}")

        output_lines.append("\t]")

        return "\n".join(output_lines)
    

    def get_all_files(self) -> List[str]:
        """
        Retrieves a list of all file paths included in the structure.

        Returns:
            List[str]: A list of file paths, relative to the project root.
        """
        all_paths = []
        # Start the recursive search from the root of the tree with an empty path prefix
        self._recursive_get_files(self.tree, [], all_paths)
        return sorted(all_paths) # Sort for consistent output

    def _recursive_get_files(self, current_node: Dict[str, Any], current_path: List[str], all_paths: List[str]):
        """
        A helper method to recursively traverse the tree and collect file paths.

        Args:
            current_node (Dict[str, Any]): The current dictionary (directory) to search in.
            current_path (List[str]): The list of path parts leading to the current node.
            all_paths (List[str]): The list to which discovered file paths are added.
        """
        for name, node_content in current_node.items():
            if name == self.NODE_TYPE_KEY:
                continue

            # Create the path for the current item
            new_path_parts = current_path + [name]
            
            if isinstance(node_content, dict):
                node_type = node_content.get(self.NODE_TYPE_KEY)
                
                if node_type == self.NODE_TYPE_FILE:
                    # If it's a file, join the path parts and add to the list
                    full_path = "/".join(new_path_parts)
                    all_paths.append(full_path)
                elif node_type == self.NODE_TYPE_DIR:
                    # If it's a directory, recurse into it
                    self._recursive_get_files(node_content, new_path_parts, all_paths)

    def get_structure_string(self, indent_size: int = 4) -> str:
        """
        Generates a string representation of the directory and file structure.

        Args:
            indent_size (int): The number of spaces for each indentation level. Defaults to 4.

        Returns:
            str: A string showing the hierarchical structure of the project.
        """
        lines = []
        indent_str = ' ' * indent_size
        self._recursive_build_structure_string(self.tree, 0, indent_str, lines)
        return "\n".join(lines)

    def _recursive_build_structure_string(self, current_node: Dict[str, Any], level: int, indent_str: str, lines: List[str]):
        """
        A recursive helper to build the structure string with proper indentation.
        """
        prefix = indent_str * level
        
        # Separate directories and files to list directories first
        dirs = {}
        files = []
        for name, content in current_node.items():
            if name == self.NODE_TYPE_KEY:
                continue
            if isinstance(content, dict) and content.get(self.NODE_TYPE_KEY) == self.NODE_TYPE_DIR:
                dirs[name] = content
            else:
                files.append(name)
        
        # Process sorted directories first
        for dir_name in sorted(dirs.keys()):
            lines.append(f"{prefix}{dir_name}/")
            self._recursive_build_structure_string(dirs[dir_name], level + 1, indent_str, lines)
            
        # Then process sorted files
        for file_name in sorted(files):
            lines.append(f"{prefix}{file_name}")

    def map_output_locations_by_files(self, output_locations: List[str], file_names: List[str], keep_old_order=False):
        if keep_old_order:
            results = {fn: [] for fn in file_names}
        else:
            results = {}  # dict is insertion ordered
        current_file_name = None
        for loc in output_locations:
            for line in loc.splitlines():
                if line.strip().endswith(self.language_manager.code_suffix_set()):
                    current_file_name = line.strip()
                elif line.strip() and any(
                    line.startswith(w)
                    for w in ["line:", "function:", "class:", "variable:", "global"]
                ):
                    if current_file_name in file_names:
                        if current_file_name not in results:
                            results[current_file_name] = []
                        results[current_file_name].append(line.strip())
                    else:
                        pass

        for file_name in file_names:
            if file_name not in results:  # guard for new order case
                results[file_name] = []

        return results
    

    def transfer_location_strings_into_intervals(
        self,
        location_string_map_by_file,
        file,
        context_window=10,
        loc_interval=False,
        fine_grain_only=False,
        verbose=False,
    ) -> tuple[list, list]:
        """
        location string: like 'a.py\nfunction: func1'
        """
        assert file in location_string_map_by_file
        location_string_list = location_string_map_by_file[file]

        line_loc = []
        if isinstance(location_string_list, str):
            # if its a single loc
            location_string_list = [location_string_list]

        unrecognized_locs = []

        for location_string in location_string_list:
            current_class_name = ""
            for loc in location_string.splitlines():
                # handle cases like "class: MyClass.my_method"
                if loc.startswith("class: ") and "." not in loc:
                    loc = loc[len("class: ") :].strip()
                    relevant_class = self.find(CodeLocation.for_class(file, loc))

                    if relevant_class:
                        line_loc.append(
                            (relevant_class["start_line"], relevant_class["end_line"])
                        )
                        current_class_name = loc
                    else:
                        unrecognized_locs.append(loc)

                elif loc.startswith("function: ") or "." in loc:
                    full_loc = loc
                    loc = loc.split(":", 1)[-1].strip()

                    if "." in loc:
                        # assume its a method within a class
                        method_name = loc.split(".")[1]
                        class_name = loc.split(".")[0]

                        relevant_method = self.find(CodeLocation.for_method(file, class_name, method_name))

                        if relevant_method:
                            line_loc.append(
                                (
                                    relevant_method["start_line"],
                                    relevant_method["end_line"],
                                )
                            )
                        else:
                            unrecognized_locs.append(loc)

                    else:
                        relevant_function = self.find(CodeLocation.for_function(file, loc))
                        if relevant_function: 
                            line_loc.append(
                                (
                                    relevant_function["start_line"],
                                    relevant_function["end_line"],
                                )
                            )
                        else:
                            if current_class_name != "":
                                # check if its a method name belongs to recent class
                                relevant_method = self.find(CodeLocation.for_method(file, current_class_name, loc))
                                if relevant_method:
                                    line_loc.append(
                                        (
                                            relevant_method["start_line"],
                                            relevant_method["end_line"],
                                        )
                                    )
                                else:
                                    unrecognized_locs.append(loc)
                            else:
                                # look for it in any class, only add if there is unique method name in the file
                                relevant_methods = []
                                for cls_name in self.get_class_names(file):
                                    relevant_method = self.find(CodeLocation.for_method(file, cls_name, loc))
                                    if relevant_method:
                                        relevant_methods.append(relevant_method)
                                if len(relevant_methods) == 1:
                                    line_loc.append(
                                        (
                                            relevant_methods[0]["start_line"],
                                            relevant_methods[0]["end_line"],
                                        )
                                    )
                                else:
                                    unrecognized_locs.append(loc)
                elif loc.startswith("global"):
                    file_node = self.get_file_node(file)
                    text_lines = file_node['text_lines']
                    global_info = self.find(CodeLocation.for_global(file))
                    if global_info:
                        global_blocks = global_info["blocks"]
                        for block in global_blocks:
                            code_lines = text_lines[block["start_line"]-1 : block["end_line"]]
                            if "".join(code_lines).strip():
                                # if the block is not empty
                                line_loc.append(
                                    (block["start_line"], block["end_line"])
                                )
                    else:
                        unrecognized_locs.append(loc) 

                else:
                    if loc.strip():
                        unrecognized_locs.append(loc)
                    # assert False

        # Fine-grained-only loc: Remove intervals that are supersets of another.
        if fine_grain_only:
            filtered_line_loc = []
            for st, en in line_loc:
                if filtered_line_loc:
                    last_st, last_en = filtered_line_loc[-1]
                    # If the current interval is a more fine-grained loc, remove the superset.
                    if last_st <= st and en <= last_en:
                        filtered_line_loc.pop()
                filtered_line_loc.append((st, en))
            line_loc = filtered_line_loc

        # compute max min
        # TODO: think of strategies to do bunched up lines
        # TODO: e.g., we can have multiple code segments (right now, its just one)

        content = self.get_file_node(file)['text_lines']

        if len(line_loc) == 0:
            return [], []

        # max_line = min(max(line_loc) + context_window, len(content))
        # min_line = max(min(line_loc) - context_window, 0)
        #
        # return line_loc, max_line, min_line

        if verbose:
            print("Unrecognized locs:")
            for loc in unrecognized_locs:
                print(loc)

        # compute overlapping locations instead
        if loc_interval:
            contextual_line_loc = []
            for loc in line_loc:
                # Clip the context window to the beginning and end of the file
                max_line = max(min(loc[1] + context_window, len(content)), 0)
                min_line = min(max(loc[0] - context_window, 0), len(content))
                contextual_line_loc.append((min_line, max_line))

            return line_loc, merge_intervals(contextual_line_loc)
        else:
            # defaulting to max min
            max_line = min(max([loc[1] for loc in line_loc]) + context_window, len(content))
            min_line = max(min([loc[0] for loc in line_loc]) - context_window, 0)

            return line_loc, [(min_line, max_line)]

    def get_location(self, file: str, line_number: int) -> Optional['CodeLocation']:
        """
        Resolve a line of a file to the most specific component that contains it.

        Lookup order: a method of the enclosing class, the enclosing class
        itself, a top-level function, and finally a global block. The first
        class whose line range contains the line decides the outcome, so
        functions and global blocks are only consulted when no class matches.

        Args:
            file (str): The relative path to the file (e.g., 'src/main.py').
            line_number (int): The line number to locate (1-indexed); it is
                compared inclusively against each component's
                ``start_line`` / ``end_line``.

        Returns:
            Optional[CodeLocation]: The location of the enclosing component,
            or None when no valid file node is available or when no
            component covers the line.
        """
        def contains_line(component) -> bool:
            # Inclusive range test shared by every component kind.
            return component['start_line'] <= line_number <= component['end_line']

        node = self.get_file_node(file)

        # Anything that is not a dict tagged as a file node is treated as a lookup failure.
        is_file_node = isinstance(node, dict) and node.get(self.NODE_TYPE_KEY) == self.NODE_TYPE_FILE
        if not is_file_node:
            print(f"Could not retrieve a valid file node for: {file}")
            return None

        # Classes first: prefer a method inside the class, fall back to the class itself.
        for class_info in node.get('classes', []):
            if not contains_line(class_info):
                continue
            method_info = next(
                (candidate for candidate in class_info.get('methods', []) if contains_line(candidate)),
                None,
            )
            if method_info is not None:
                return CodeLocation.for_method(file, class_info['name'], method_info['name'])
            return CodeLocation.for_class(file, class_info['name'])

        # Then top-level functions.
        for function_info in node.get('functions', []):
            if contains_line(function_info):
                return CodeLocation.for_function(file, function_info['name'])

        # Finally global blocks; every global block maps to the same file-wide location.
        if any(contains_line(block) for block in node.get('globals', [])):
            return CodeLocation.for_global(file)

        # The line is not covered by any recorded component.
        return None

def checkout_commit(repo_path, commit_id):
    """Run ``git checkout <commit_id>`` inside the repository at ``repo_path``.

    Progress and failures are reported on stdout. Errors raised while running
    git are caught here and printed, so nothing propagates to the caller.

    :param repo_path: Path to the local git repository
    :param commit_id: Commit ID to checkout
    :return: None
    """
    # ``-C`` makes git operate on repo_path without changing this process's cwd.
    git_command = ["git", "-C", repo_path, "checkout", commit_id]
    try:
        print(f"Checking out commit {commit_id} in repository at {repo_path}...")
        subprocess.run(git_command, check=True)
        print("Commit checked out successfully.")
    except subprocess.CalledProcessError as git_error:
        # git ran but exited with a non-zero status.
        print(f"An error occurred while running git command: {git_error}")
    except Exception as other_error:
        # Anything else, e.g. the git executable could not be started.
        print(f"An unexpected error occurred: {other_error}")

# Name of the sub-directory, inside a playground directory, that repositories are cloned into.
TOP_FOLDER = "default"

def clone_repo(repo_name, repo_playground):
    """Clone ``https://github.com/<repo_name>.git`` into ``<repo_playground>/<TOP_FOLDER>``.

    Progress and failures are reported on stdout. Errors raised while running
    git are caught here and printed, so nothing propagates to the caller.

    :param repo_name: GitHub repository in ``owner/name`` form
    :param repo_playground: Directory under which the clone is created
    :return: None
    """
    try:
        remote_url = f"https://github.com/{repo_name}.git"
        destination = f"{repo_playground}/{TOP_FOLDER}"

        print(f"Cloning repository from {remote_url} to {destination}...")
        subprocess.run(["git", "clone", remote_url, destination], check=True)
        print("Repository cloned successfully.")
    except subprocess.CalledProcessError as git_error:
        # git ran but exited with a non-zero status.
        print(f"An error occurred while running git command: {git_error}")
    except Exception as other_error:
        # Anything else, e.g. the git executable could not be started.
        print(f"An unexpected error occurred: {other_error}")

def get_before_after_ts_structure_from_scratch(
    repo_name, commit_id, patch, instance_id, repo_playground, language
):
    """Build code structures of a repository before and after applying a patch.

    The repository is cloned into a uniquely named sub-directory of
    ``repo_playground``, checked out at ``commit_id`` and parsed (the "before"
    structure). ``patch`` is then applied with ``git apply`` and the tree is
    parsed again (the "after" structure). Both structures are filtered with
    ``filter_none_code()`` and ``filter_test_files()``. The temporary
    sub-directory is always removed before returning.

    :param repo_name: GitHub repository in ``owner/name`` form
    :param commit_id: Commit to check out before parsing
    :param patch: Text of the diff to apply on top of ``commit_id``
    :param instance_id: Identifier used only in error messages
    :param repo_playground: Parent directory for the temporary working directory
    :param language: Language name passed to ``CodeStructure``
    :return: ``(before_structure, after_structure)``, or ``(None, None)`` when
        cloning, checkout, patching or parsing fails
    """
    import shutil  # local import: only needed for playground cleanup

    # Generate a temporary folder and add uuid to avoid collision
    repo_playground = os.path.join(repo_playground, str(uuid.uuid4()))

    # assert playground doesn't exist
    assert not os.path.exists(repo_playground), f"{repo_playground} already exists"

    # create playground
    os.makedirs(repo_playground)

    try:
        repo_dir = f"{repo_playground}/{TOP_FOLDER}"

        clone_repo(repo_name, repo_playground)
        # clone_repo only prints on failure, so make sure the clone really exists.
        if not os.path.isdir(repo_dir):
            raise RuntimeError(f"Repository {repo_name} was not cloned into {repo_dir}")

        checkout_commit(repo_dir, commit_id)
        # checkout_commit only prints on failure as well; compare HEAD with the
        # requested commit so a failed checkout is not silently analysed.
        head_sha, wanted_sha = [
            subprocess.run(
                ["git", "-C", repo_dir, "rev-parse", "--verify", revision],
                check=True,
                capture_output=True,
                text=True,
            ).stdout.strip()
            for revision in ("HEAD", f"{commit_id}^{{commit}}")
        ]
        if head_sha != wanted_sha:
            raise RuntimeError(f"Failed to check out commit {commit_id} in {repo_dir}")

        before_structure = CodeStructure(repo_dir, language)
        before_structure.filter_none_code()
        before_structure.filter_test_files()

        # apply patch and get the after structure
        patch_file_path = os.path.join(repo_playground, "patch.diff")
        # Write as UTF-8 regardless of the locale's default encoding.
        with open(patch_file_path, "w", encoding="utf-8") as patch_file:
            patch_file.write(patch)
        try:
            subprocess.run(
                [
                    "git",
                    "-C",
                    repo_dir,
                    "apply",
                    "--whitespace=nowarn",
                    # Relative to repo_dir because of ``-C``: the patch sits next to the clone.
                    "../patch.diff",
                ],
                check=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while applying the patch for instance {instance_id}: {e}")
            raise

        # get the after structure
        after_structure = CodeStructure(repo_dir, language)
        after_structure.filter_none_code()
        after_structure.filter_test_files()
    except Exception as e:
        print("Error occured while parsing:", e)
        return None, None
    finally:
        # Remove the whole uuid playground (clone and patch file) on every
        # path; ignore_errors keeps a cleanup problem from masking the result.
        shutil.rmtree(repo_playground, ignore_errors=True)
    return before_structure, after_structure

def get_completion_ts_structure_from_scratch(
    instance_id: str, repo_name, base_commit, patch, completion_task, repo_playground, language
):
    """Build code structures around a completion task's masked region.

    The repository is cloned into a uniquely named sub-directory of
    ``repo_playground`` and checked out at ``base_commit``. ``patch`` is
    applied and the tree is parsed (the "after" structure). The task's mask
    patch (``completion_task['task_patch']``) is then applied on top and the
    tree is parsed again (the "before" structure, i.e. the state before the
    completion). Both structures are filtered with ``filter_none_code()`` and
    ``filter_test_files()``. The temporary sub-directory is always removed
    before returning.

    :param instance_id: Identifier used only in error messages
    :param repo_name: GitHub repository in ``owner/name`` form
    :param base_commit: Commit to check out before applying any patch
    :param patch: Text of the diff applied on top of ``base_commit``
    :param completion_task: Mapping that provides the mask diff under ``'task_patch'``
    :param repo_playground: Parent directory for the temporary working directory
    :param language: Language name passed to ``CodeStructure``
    :return: ``(before_structure, after_structure)``, or ``(None, None)`` when
        cloning, checkout, patching or parsing fails
    :raises KeyError: if ``completion_task`` has no ``'task_patch'`` entry
    """
    import shutil  # local import: only needed for playground cleanup

    # get the mask patch first, so a malformed task fails before any directory is created
    mask_patch = completion_task['task_patch']

    # Generate a temporary folder and add uuid to avoid collision
    repo_playground = os.path.join(repo_playground, str(uuid.uuid4()))

    # assert playground doesn't exist
    assert not os.path.exists(repo_playground), f"{repo_playground} already exists"

    # create playground
    os.makedirs(repo_playground)

    def _apply_patch_file(repo_dir, patch_text, file_name, label):
        """Write patch_text next to the clone and apply it with ``git apply``."""
        # Write as UTF-8 regardless of the locale's default encoding.
        with open(os.path.join(repo_playground, file_name), "w", encoding="utf-8") as handle:
            handle.write(patch_text)
        try:
            subprocess.run(
                [
                    "git",
                    "-C",
                    repo_dir,
                    "apply",
                    "--whitespace=nowarn",
                    # Relative to repo_dir because of ``-C``: the patch sits next to the clone.
                    f"../{file_name}",
                ],
                check=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while applying the {label} for instance {instance_id}: {e}")
            raise

    try:
        repo_dir = f"{repo_playground}/{TOP_FOLDER}"

        clone_repo(repo_name, repo_playground)
        # clone_repo only prints on failure, so make sure the clone really exists.
        if not os.path.isdir(repo_dir):
            raise RuntimeError(f"Repository {repo_name} was not cloned into {repo_dir}")

        checkout_commit(repo_dir, base_commit)
        # checkout_commit only prints on failure as well; compare HEAD with the
        # requested commit so a failed checkout is not silently analysed.
        head_sha, wanted_sha = [
            subprocess.run(
                ["git", "-C", repo_dir, "rev-parse", "--verify", revision],
                check=True,
                capture_output=True,
                text=True,
            ).stdout.strip()
            for revision in ("HEAD", f"{base_commit}^{{commit}}")
        ]
        if head_sha != wanted_sha:
            raise RuntimeError(f"Failed to check out commit {base_commit} in {repo_dir}")

        # apply patch and get the after structure
        _apply_patch_file(repo_dir, patch, "patch.diff", "patch")
        after_structure = CodeStructure(repo_dir, language)
        after_structure.filter_none_code()
        after_structure.filter_test_files()

        # apply mask patch and get the before structure before the completion
        _apply_patch_file(repo_dir, mask_patch, "mask_patch.diff", "mask patch")
        before_structure = CodeStructure(repo_dir, language)
        before_structure.filter_none_code()
        before_structure.filter_test_files()
    except Exception as e:
        print("Error occured while parsing:", e)
        return None, None
    finally:
        # Remove the whole uuid playground (clone and patch files) on every
        # path; ignore_errors keeps a cleanup problem from masking the result.
        shutil.rmtree(repo_playground, ignore_errors=True)
    return before_structure, after_structure

