import json
import os
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv

# Must run before the os.getenv() lookup below so that values supplied via a
# .env file (python-dotenv) are already visible in the process environment.
load_dotenv()
# The variable is looked up under the mixed-case name "OpenAI_API_KEY".
# NOTE(review): if it is unset this is None and the OpenAI client constructor
# decides how to proceed — confirm the .env entry uses this exact spelling.
OpenAI_API_KEY = os.getenv("OpenAI_API_KEY")
# Single module-level client used by call_gpt_4o().
client = OpenAI(api_key=OpenAI_API_KEY)

# JSON file read by main(); each entry's "video_sponsorship_transcript" is rewritten.
INPUT_FILE = "video_sponsorship_info.json"
# JSON file main() writes the censored entries to.
OUTPUT_FILE = "video_sponsorship_info_censored.json"

# Few-shot prompt sent to the model as a single user message.
# It is filled in with str.format(), so "{transcript}" is the only placeholder;
# any literal brace added to this text later must be doubled ("{{" / "}}").
# Every "Rewritten" example mirrors the sentence structure of its "Original",
# because the model imitates the examples more closely than the instructions.
PROMPT_TEMPLATE = """You will be given a sponsor message from a video transcript. Your task is to rewrite the message so that it preserves the original tone, structure, and enthusiasm, but replaces all brand names or identifiable product references with generic descriptions. The format and style should remain virtually identical.

If the transcript does not describe a sponsor message (i.e., there is no actual sponsorship content), return the exact string: NO_SPONSOR

Return only the modified transcript or NO_SPONSOR. DO NOT return any explanations, commentary, or extra text.

Examples:

Original:
"This video is sponsored by Raid Shadow Legends — one of the biggest mobile RPGs of all time!"
Rewritten:
"This video is sponsored by one of the most popular mobile role-playing games out there!"

Original:
"Thanks to NordVPN for sponsoring today's video. Get secure and private access to the internet with NordVPN!"
Rewritten:
"Thanks to a leading virtual private network service for sponsoring today's video. Get secure and private access to the internet with this VPN!"

Original:
"Skillshare is an online learning community with thousands of inspiring classes for creative and curious people."
Rewritten:
"This platform is an online learning community with thousands of inspiring classes for creative and curious people."

Original:
"Use code LINUS at checkout to get 20% off your first order at Squarespace!"
Rewritten:
"Use the creator’s promo code at checkout to get 20% off your first order at a popular website-building platform!"

Original:
"This episode is sponsored by BetterHelp — get matched with a licensed therapist and start living a happier life today."
Rewritten:
"This episode is sponsored by an online therapy service — get matched with a licensed professional and start living a happier life today."

Original:
"Smash that like button and don't forget to subscribe!"
Rewritten:
NO_SPONSOR

Now rewrite the following sponsor message:

\"\"\"{transcript}\"\"\"
"""

def call_gpt_4o(transcript):
    """Ask gpt-4o to rewrite a sponsor transcript with brand names removed.

    Args:
        transcript: Raw sponsor-message text. Surrounding whitespace is
            stripped before it is substituted into PROMPT_TEMPLATE.

    Returns:
        The model's reply with surrounding whitespace stripped. Per the
        prompt this is either the rewritten message or ``NO_SPONSOR``.

    Raises:
        ValueError: If the response carries no choices, or the first
            choice has no text content.
    """
    prompt = PROMPT_TEMPLATE.format(transcript=transcript.strip())
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )

    if not response.choices:
        raise ValueError("gpt-4o returned no choices")

    # `content` is optional in the SDK's message type (e.g. when the model
    # refuses); fail with a clear message instead of an AttributeError on None.
    content = response.choices[0].message.content
    if content is None:
        raise ValueError("gpt-4o returned a message with no text content")

    return content.strip()

def _write_output(entries):
    """Write *entries* to OUTPUT_FILE as indented, non-ASCII-preserving JSON."""
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(entries, f, indent=2, ensure_ascii=False)


def main():
    """Censor every sponsorship transcript in INPUT_FILE and save the result.

    Each entry's "video_sponsorship_transcript" is sent to call_gpt_4o().
    Entries are dropped when the transcript is missing, not a string, or
    blank, when the model answers ``NO_SPONSOR``, or when the call raises
    (the error is printed and processing continues). Kept entries are
    updated in place with the rewritten transcript and written to
    OUTPUT_FILE.
    """
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    updated = []

    for entry in tqdm(data, desc="Censoring transcripts"):
        original = entry.get("video_sponsorship_transcript")
        # A JSON null or non-string value would make .strip() raise and abort
        # the whole run; treat it the same as a missing/blank transcript.
        if not isinstance(original, str) or not original.strip():
            continue

        # Only the API call is expected to fail; keep the try body minimal.
        try:
            result = call_gpt_4o(original)
        except Exception as e:
            print(f"Error processing video_id {entry.get('video_id', 'UNKNOWN')}: {e}")
            continue

        # The few-shot examples show quoted outputs, so tolerate a sentinel
        # that comes back wrapped in quotes.
        if result.strip().strip("\"'") == "NO_SPONSOR":
            continue

        entry["video_sponsorship_transcript"] = result
        updated.append(entry)

        # Checkpoint after every kept entry so an interrupted run still
        # leaves the progress made so far on disk.
        _write_output(updated)

    # Always write once at the end: without this, a run that keeps zero
    # entries would never create/refresh OUTPUT_FILE despite the message below.
    _write_output(updated)

    print(f"Saved censored file to {OUTPUT_FILE} ({len(updated)} entries kept)")

# Run the censoring pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

