import json
import os
import time

import praw
from prawcore.exceptions import RequestException, ServerError
from tqdm import tqdm

# ---------- CONFIG ----------
# Maximum number of search results (posts) fetched per topic.
TOP_POSTS = 20
# Maximum number of comments kept from each individual post.
COMMENTS_PER_POST = 100
# Upper bound on the number of comments stored for one topic.
COMMENTS_TOTAL_PER_TOPIC = TOP_POSTS * COMMENTS_PER_POST
# JSON file that is read at startup and rewritten after each topic.
DATA_FILE = "topics.json"

# Rate-limit / retry settings
# Seconds to pause after finishing a topic; the per-post pause is this
# value divided by TOP_POSTS.
SLEEP_BETWEEN_TOPICS = 10
# Seconds to wait before retrying after a request/server error.
RETRY_DELAY = 10

# PRAW authentication
def authenticate_reddit():
    """Create and return a ``praw.Reddit`` client.

    The credentials below are placeholders and must be replaced with real
    application values before the script can talk to Reddit.
    """
    credentials = {
        'client_id': 'YOUR_CLIENT_ID',
        'client_secret': 'YOUR_CLIENT_SECRET',
        'user_agent': "YOUR_USER_AGENT",
    }
    return praw.Reddit(**credentials)

# Load existing JSON (or initialize)
def load_data(filename):
    """Return the parsed contents of the JSON file *filename*.

    Args:
        filename: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON value. When the file does not exist yet, an empty
        list is returned so the caller starts from an empty dataset.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # No dataset on disk yet: initialize with an empty topic list.
        return []

# Save entire dataset once per topic
def save_data(filename, data):
    """Write *data* to *filename* as indented UTF-8 JSON.

    The JSON is first written to a sibling ``<filename>.tmp`` file and then
    moved over the target with ``os.replace``. If serialization or the write
    fails part-way, the previous contents of *filename* are left untouched
    instead of being truncated.

    Args:
        filename: Destination path of the JSON file.
        data: JSON-serializable object to store.

    Raises:
        TypeError: If *data* contains values ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_name = f"{filename}.tmp"
    try:
        with open(tmp_name, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        os.replace(tmp_name, filename)
    except BaseException:
        # Do not leave a half-written temp file behind; the error itself
        # is re-raised unchanged for the caller.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise

# Safely expand all 'MoreComments' objects
def safe_replace_more(comments, max_retries=5):
    """Expand every 'MoreComments' placeholder, retrying on transient errors.

    Calls ``comments.replace_more(limit=None)`` and, when it fails with a
    ``RequestException`` or ``ServerError``, waits ``RETRY_DELAY`` seconds
    and tries again.

    Args:
        comments: Object exposing ``replace_more(limit=...)`` (the caller
            passes ``post.comments``).
        max_retries: Maximum number of attempts before giving up. ``None``
            retries without bound.

    Raises:
        RequestException, ServerError: Re-raised from the final attempt once
            ``max_retries`` attempts have failed, so a persistent outage
            cannot hang the script forever.
    """
    attempts = 0
    while True:
        try:
            comments.replace_more(limit=None)
            return
        except (RequestException, ServerError):
            attempts += 1
            if max_retries is not None and attempts >= max_retries:
                raise
            print("Request/server error, retrying after delay...")
            time.sleep(RETRY_DELAY)

# Extract up to N highest-scored comments from a post
def extract_top_comments(post, limit):
    """Return the stripped bodies of the best-scored usable comments.

    After expanding the post's comment tree, a comment is kept only when it
    has an author, is not distinguished, its author name neither contains
    'bot' nor equals 'automoderator' (case-insensitive), and its body has
    at least five whitespace-separated words.

    Args:
        post: Submission whose ``comments`` are read.
        limit: Maximum number of comment bodies to return.

    Returns:
        A list of comment body strings ordered by score, highest first.
    """
    safe_replace_more(post.comments)

    eligible = []
    for comment in post.comments.list():
        # Deleted comments have no author.
        if not comment.author:
            continue
        if comment.distinguished is not None:
            continue
        author_name = comment.author.name.lower()
        if 'bot' in author_name or author_name == 'automoderator':
            continue
        # Drop very short comments (fewer than five words).
        if len(comment.body.split()) < 5:
            continue
        eligible.append(comment)

    # Stable in-place sort, highest score first, then keep the first `limit`.
    eligible.sort(key=lambda comment: comment.score, reverse=True)
    return [comment.body.strip() for comment in eligible[:limit]]


# ----------- Main processing function -----------
def process_topics():
    """Collect top comments for every unprocessed topic in DATA_FILE.

    For each entry of the loaded dataset that is not yet flagged as
    processed, searches r/all for the entry's topic, gathers up to
    COMMENTS_PER_POST comments from each of at most TOP_POSTS posts, stores
    them under ``entry['views']``, sets ``entry['processed']`` and saves the
    whole dataset, so an interrupted run can resume with the next topic.

    A failure while handling a single post is reported and that post is
    skipped; the remaining posts of the topic are still processed.
    """
    reddit = authenticate_reddit()
    data = load_data(DATA_FILE)

    for entry in data:
        topic = entry['topic']

        # Skip if already processed; an entry without the flag is treated
        # as not yet processed instead of raising KeyError.
        if entry.get('processed'):
            print(f"✔ {topic}: already has been processed; skipping.")
            continue

        collected = []

        # limit to TOP_POSTS posts
        print(f"Searching posts for: {topic}...")
        search_results = list(reddit.subreddit('all').search(topic, sort='relevance', limit=TOP_POSTS))
        print(f"Retrieved {len(search_results)} posts for topic: {topic}")

        # No explicit total: tqdm takes it from the list length, which stays
        # correct when the search returns fewer than TOP_POSTS posts.
        for post in tqdm(search_results, desc="Posts"):
            # tqdm.write keeps messages from corrupting the progress bar.
            tqdm.write(f"Post: {post.title[:60]}…")
            try:
                post_views = extract_top_comments(post, COMMENTS_PER_POST)
            except Exception as e:
                # Best effort: one bad post must not abort the whole topic.
                tqdm.write(f"Error on post: {e} (skipping)")
            else:
                collected.extend(post_views)
                tqdm.write(f"collected {len(post_views)} comments.")

            # brief pause to respect rate limits
            time.sleep(SLEEP_BETWEEN_TOPICS / TOP_POSTS)

        # Trim to the per-topic cap (shorter lists are stored as-is)
        entry['views'] = collected[:COMMENTS_TOTAL_PER_TOPIC]
        entry['processed'] = True
        save_data(DATA_FILE, data)
        print(f"Saved {len(entry['views'])} comments for topic: {topic}")

        # wait before next topic
        time.sleep(SLEEP_BETWEEN_TOPICS)

    print("\nAll topics processed.")

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    process_topics()