import json
import logging
import os
import re
import jsonlines
import subprocess
import sys

# Module-wide logging setup, executed at import time: records at INFO level and
# above are emitted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def extract_content_from_tag(tag_name: str, text: str) -> list[str]:
    """
    Collect the text enclosed by every ``<tag_name>...</tag_name>`` pair.

    The opening tag must appear exactly as ``<tag_name>``, so tags that carry
    attributes are not matched. Matching is non-greedy: each opening tag is
    paired with the nearest following closing tag, and the enclosed content
    may span several lines.

    Args:
        tag_name: Name of the tag to look for (e.g., "p", "div"). It is
            treated literally, even if it contains regex metacharacters.
        text: The string to scan.

    Returns:
        The enclosed contents in order of appearance, or an empty list when
        no complete tag pair is present.
    """
    # re.escape keeps any regex metacharacters in the tag name literal.
    tag = re.escape(tag_name)
    opening, closing = f"<{tag}>", f"</{tag}>"

    # Lazy `(.*?)` stops at the first closing tag; DOTALL lets '.' cross newlines.
    hits = re.finditer(f"{opening}(.*?){closing}", text, re.DOTALL)
    return [hit.group(1) for hit in hits]

def read_json_objects(filename, field_names=None):
    """
    Load the items stored in a ``.jsonl`` or ``.json`` file.

    ``.jsonl`` files are parsed as one JSON document per line (blank lines
    are skipped). ``.json`` files are parsed whole and the top-level value is
    iterated to produce the items.

    Args:
        filename: Path to the file; its extension selects the parser.
        field_names: Optional list (or tuple) of keys to keep. When given,
            every dict item is reduced to the listed keys it actually
            contains; non-dict items are passed through unchanged. Any other
            value (including None) disables the filtering.

    Returns:
        A list with the loaded items; an empty list when the extension is
        neither ``.jsonl`` nor ``.json``; None when the file is missing,
        cannot be decoded, or any other error occurs (the error is logged).
    """
    file_extension = os.path.splitext(filename)[1]
    if file_extension not in ('.jsonl', '.json'):
        logging.error(f"Unknown file extension {file_extension}")
        return []

    is_jsonl = file_extension == '.jsonl'
    try:
        if is_jsonl:
            # JSON Lines files are UTF-8 (that is also what the jsonlines
            # writer used in this module produces), so do not depend on the
            # platform's default encoding here.
            with open(filename, 'r', encoding='utf-8') as file:
                # Blank lines are not JSON documents; skipping them keeps a
                # stray empty line from invalidating the whole file.
                items = [json.loads(line) for line in file if line.strip()]
        else:
            with open(filename, 'r') as file:
                items = list(json.load(file))

        if isinstance(field_names, (list, tuple)):
            # Project each dict onto the requested keys, ignoring absent ones.
            items = [
                {key: item[key] for key in field_names if key in item}
                if isinstance(item, dict) else item
                for item in items
            ]
        return items
    except FileNotFoundError:
        logging.error("The file was not found.")
    except json.JSONDecodeError:
        file_kind = "JSONL" if is_jsonl else "JSON"
        logging.error(f"There was an error decoding the {file_kind} file.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
    return None


# def read_json_fields(filename):
#     try:
#         with open(filename, 'r') as file:
#             data = json.load(file)
#         return data
#     except FileNotFoundError:
#         logging.error("The file was not found.")
#     except json.JSONDecodeError:
#         logging.error("There was an error decoding the JSON file.")
#     except Exception as e:
#         logging.error(f"An error occurred: {e}")


def read_jsonl_field(filename, field_name='instruction'):
    """
    Collect the value of one field from every record of a JSON Lines file.

    Each non-blank line is parsed as a JSON document. Lines whose document is
    a dict containing ``field_name`` contribute that value; blank lines,
    non-dict documents and dicts without the field are skipped.

    Args:
        filename: Path to the JSON Lines file.
        field_name: Key to extract from each record. Defaults to
            'instruction'.

    Returns:
        A list of the extracted values in file order, or None when the file
        is missing, a line cannot be decoded, or any other error occurs (the
        error is logged).
    """
    try:
        output_fields = []
        # JSON Lines files are UTF-8 (that is also what the jsonlines writer
        # used in this module produces), so do not depend on the platform's
        # default encoding.
        with open(filename, 'r', encoding='utf-8') as file:
            # Stream line by line instead of loading the whole file at once.
            for line in file:
                if not line.strip():
                    # A blank line is not a JSON document; ignore it rather
                    # than failing the entire read.
                    continue
                item = json.loads(line)
                # Only dicts have named fields; for a list or string, `in`
                # would mean membership/substring and indexing would fail.
                if isinstance(item, dict) and field_name in item:
                    output_fields.append(item[field_name])
        return output_fields
    except FileNotFoundError:
        logging.error("The file was not found.")
    except json.JSONDecodeError:
        logging.error("There was an error decoding the JSONL file.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
    return None


def read_json_field(filename, field_name='instruction'):
    """
    Collect the value of one field from every item of a JSON or JSONL file.

    Files ending in ``.jsonl`` are delegated to ``read_jsonl_field``. Any
    other file is parsed as a single JSON document whose top-level value is
    iterated; each item containing ``field_name`` contributes that value.

    Args:
        filename: Path to the file to read.
        field_name: Key to extract from each item. Defaults to 'instruction'.

    Returns:
        A list of the extracted values in order, or None when the file is
        missing, cannot be decoded, or any other error occurs (the error is
        logged).
    """
    _, suffix = os.path.splitext(filename)
    if suffix == '.jsonl':
        return read_jsonl_field(filename, field_name)

    try:
        with open(filename, 'r') as handle:
            records = json.load(handle)
        # Keep only the items that actually carry the requested field.
        return [record[field_name] for record in records if field_name in record]
    except FileNotFoundError:
        logging.error(f"The file was not found. {filename}")
    except json.JSONDecodeError:
        logging.error("There was an error decoding the JSON file.")
    except Exception as exc:
        logging.error(f"An error occurred: {exc}")


def write_data_to_json_file(data, file_path):
    """
    Serialize ``data`` as indented JSON into ``file_path``.

    The file is opened in write mode, so existing content is replaced. Output
    uses a 4-space indent and keeps non-ASCII characters unescaped. Failures
    are logged rather than raised.

    Args:
        data: The JSON-serializable value to write.
        file_path: Destination path of the JSON file.

    Returns:
        None, whether or not the write succeeded.
    """
    try:
        with open(file_path, 'w') as out:
            json.dump(data, out, ensure_ascii=False, indent=4)
        logging.info(f"Data successfully written to {file_path}")
    except Exception as exc:
        # Best-effort write: report the problem and carry on.
        logging.error(f"An error occurred: {exc}")


def write_data_to_jsonlines_file(data, file_path, mode='a'):
    """
    Write every item of ``data`` to a JSON Lines file via ``jsonlines``.

    Failures are logged rather than raised.

    Args:
        data: Iterable of items handed to the writer's ``write_all``.
        file_path: Destination path of the JSON Lines file.
        mode: File mode passed straight to ``jsonlines.open``. Defaults to
            'a'.

    Returns:
        None, whether or not the write succeeded.
    """
    try:
        with jsonlines.open(file_path, mode=mode) as sink:
            sink.write_all(data)
    except Exception as exc:
        # Best-effort write: report the problem and carry on.
        logging.error(f"An error occurred: {exc}")


def create_parent_directory(file_path):
    """
    Make sure the directory that would contain ``file_path`` exists.

    Missing ancestor directories are created as well; nothing happens if the
    directory is already there. The file itself is not created. A
    confirmation line is printed to standard output.

    Args:
        file_path (str or Path): The path to the file.
    """
    from pathlib import Path

    folder = Path(file_path).parent
    # parents=True builds the whole chain; exist_ok=True makes repeat calls harmless.
    folder.mkdir(parents=True, exist_ok=True)
    print(f"Parent directory '{folder}' ensured to exist.")


def create_directory(dir_path):
    """
    Make sure the directory ``dir_path`` exists.

    Missing ancestor directories are created as well; nothing happens if the
    directory is already there. A confirmation line is printed to standard
    output.

    Args:
        dir_path (str or Path): The path to the directory.
    """
    from pathlib import Path

    folder = Path(dir_path)
    # parents=True builds the whole chain; exist_ok=True makes repeat calls harmless.
    folder.mkdir(parents=True, exist_ok=True)
    print(f"Directory '{folder}' ensured to exist.")


def get_parent_directory(file_path):
    """
    Return the directory that contains ``file_path``.

    The result is computed from the path alone; the filesystem is not
    touched.

    Args:
        file_path (str or Path): The path to the file.

    Returns:
        A ``pathlib.Path`` for the parent directory.
    """
    from pathlib import Path

    return Path(file_path).parent
