"""
====================================
Subtopic Distribution Analysis
====================================
This script computes and visualizes the distribution of classified messages across subtopics. 
It takes as input a CSV file containing messages with assigned subtopics and produces:
1. A CSV file summarizing the total number of messages per subtopic.
2. A bar plot visualizing the message distribution across subtopics.

-----------------------------------
USAGE INSTRUCTIONS:
-----------------------------------
Run the script using:
    python subtopic_human_llm.py

This will:
- Read classified messages from `../../result/subtopic/{data_prefix}/{data_prefix}_classified.csv`
- Compute the number of messages per subtopic
- Save the distribution results to `../../result/subtopic/{data_prefix}/{data_prefix}_subtopic_distribution.csv`
- Generate a horizontal bar plot and save it to `../../result/subtopic/{data_prefix}/{data_prefix}_subtopic_distribution.png`

-----------------------------------
OUTPUT FILES:
-----------------------------------
Subtopic Distribution CSV:
   - Contains subtopics and their corresponding message counts.
   - Stored at: `../../result/subtopic/{data_prefix}/{data_prefix}_subtopic_distribution.csv`

Subtopic Distribution Plot:
   - A horizontal bar plot of message distribution across subtopics.
   - Stored at: `../../result/subtopic/{data_prefix}/{data_prefix}_subtopic_distribution.png`

-----------------------------------
DEPENDENCIES:
-----------------------------------
Ensure the following Python packages are installed before running:
- `pandas`
- `matplotlib`

To install missing dependencies, run:
    pip install pandas matplotlib

-----------------------------------
NOTES:
-----------------------------------
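- The script automatically creates the output directory if it does not exist.
- The run to analyze is selected by the `data_prefix` variable defined near the top of the script; edit it to process a different result set.
- All paths are relative, so run the script from a directory where `../../result/subtopic/` resolves to the results folder.
- The input CSV must contain a `Classified Subtopic` column.
- The bar plot provides an intuitive visualization of message distribution across subtopics.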

"""

import os
import pandas as pd
import matplotlib.pyplot as plt

# Identifier of the run to analyze; it names both the per-run result directory
# and every file stored inside it.
data_prefix = "20241028_153927_A__body_cleanse,__in_which_you_consume_only_particular_kinds_of_nutrients_over_1-3_days,_helps_your_body_to_eliminate_toxins_01JB9V4TTHV4FRSNK02H14T0X8"

# The classified input and both outputs share one per-run directory, which is
# created at import time if it is missing.
output_dir = f"../../result/subtopic/{data_prefix}"
os.makedirs(output_dir, exist_ok=True)

# Input: messages with their assigned subtopics.
classified_file = f"{output_dir}/{data_prefix}_classified.csv"

# Outputs: the per-subtopic counts table and its bar-plot rendering.
distribution_csv, distribution_plot = (
    os.path.join(output_dir, f"{data_prefix}_subtopic_distribution.{extension}")
    for extension in ("csv", "png")
)

def compute_subtopic_distribution(df, output_path=None):
    """
    Computes the distribution of messages across subtopics and stores the results in a CSV file.

    Args:
        df: DataFrame of classified messages. Must contain a
            "Classified Subtopic" column.
        output_path: Destination of the distribution CSV. When omitted, the
            module-level `distribution_csv` path is used.

    Returns:
        DataFrame with the columns "Subtopic" and "Total Messages", one row
        per distinct subtopic, ordered from most to least frequent. Rows
        whose subtopic is missing (NaN) are not counted, which is the
        default behavior of `value_counts`.

    Raises:
        KeyError: If `df` has no "Classified Subtopic" column.
    """
    if "Classified Subtopic" not in df.columns:
        # Same exception type pandas would raise, but with a message that
        # tells the user what is actually wrong with the input file.
        raise KeyError(
            "Input data has no 'Classified Subtopic' column; "
            f"available columns: {list(df.columns)}"
        )

    # Resolve the default lazily so the module-level path is only consulted
    # when the caller did not supply an explicit destination.
    if output_path is None:
        output_path = distribution_csv

    # count total messages per subtopic
    subtopic_counts = df["Classified Subtopic"].value_counts().reset_index()
    # Assign names positionally: the labels produced by reset_index() differ
    # between pandas versions, the column order does not.
    subtopic_counts.columns = ["Subtopic", "Total Messages"]
    subtopic_counts.to_csv(output_path, index=False)
    print(f"Subtopic distribution saved to: {output_path}")

    return subtopic_counts

def plot_subtopic_distribution(subtopic_counts, output_path=None):
    """
    Generates and saves a bar plot visualizing the subtopic distribution.

    Args:
        subtopic_counts: DataFrame with the columns "Subtopic" (bar labels)
            and "Total Messages" (bar lengths).
        output_path: Destination of the image file. When omitted, the
            module-level `distribution_plot` path is used.

    Returns:
        None. The plot is written to disk and the figure is closed.
    """
    if output_path is None:
        output_path = distribution_plot

    # Work on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure" so the cleanup below targets exactly this figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.barh(subtopic_counts["Subtopic"], subtopic_counts["Total Messages"], color="skyblue")
        ax.set_xlabel("Total Messages")
        ax.set_ylabel("Subtopic")
        ax.set_title("Subtopic Distribution of Messages")
        ax.invert_yaxis()  # invert to have the first row at the top
        ax.grid(axis="x", linestyle="--", alpha=0.7)
        fig.savefig(output_path, bbox_inches="tight")
    finally:
        # Release the figure even if drawing or saving fails; otherwise it
        # stays registered with pyplot for the lifetime of the process.
        plt.close(fig)
    print(f"Subtopic distribution plot saved to: {output_path}")

def main():
    """
    Main function to compute and visualize subtopic distribution.

    Reads the classified messages CSV, writes the per-subtopic counts and
    renders the bar plot.

    Returns:
        0 when every step succeeded, 1 when any step failed. Failures are
        reported on stderr rather than raised.
    """
    import sys  # local import: only needed for the error stream

    try:
        df = pd.read_csv(classified_file)
        subtopic_counts = compute_subtopic_distribution(df)
        plot_subtopic_distribution(subtopic_counts)
    except Exception as e:
        # Top-level boundary: keep the catch-all, but report on stderr and
        # signal the failure to the caller instead of finishing "successfully".
        print(f"Error in main function: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    # Propagate main()'s result as the process exit status so shells and CI
    # jobs can detect a failed run.
    raise SystemExit(main())