Generated with slop-forensics

Using human-written outputs sourced from:
- Nitral-AI/Reddit-SFW-Writing_Prompts_ShareGPT
- Project Gutenberg
- Additional set of diverse fiction novels



#!/usr/bin/env python3
"""
Convert the Nitral-AI ShareGPT dataset into a JSONL file that the
slop-forensics analysis pipeline can read directly.
"""

import os, json, logging
from tqdm import tqdm
from datasets import load_dataset, disable_caching

# ---------- config ----------
# Destination JSONL file that the converted records are written to.
OUTPUT_PATH = "results/datasets/generated_Nitral-AI__human.jsonl"
# Value stored in each record's "source" field; kept for consistency with
# other scripts.
SOURCE_NAME = "Nitral-AI"
# Value stored in each record's "model" field; any string is fine, it is
# just a label.
MODEL_FIELD = "human-authored"
# -----------------------------

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s — %(levelname)s — %(message)s",
)
# Invoke the `datasets` library's cache switch before anything is loaded
# (see the library documentation for the exact scope of what it disables).
disable_caching()

def _first_gpt_text(conversations) -> str:
    """Return the stripped text of the first "gpt" turn in a conversation.

    Args:
        conversations: The row's "conversations" value. Expected to be an
            iterable of message dicts with "from" and "value" keys, but
            ``None`` or an empty sequence is tolerated.

    Returns:
        The whitespace-stripped "value" of the first message whose "from"
        is "gpt". Returns "" when there is no such message or when its
        value is missing or not a string.
    """
    # `or ()` covers a key that is present but null, which a `.get` default
    # would not catch.
    for msg in conversations or ():
        if isinstance(msg, dict) and msg.get("from") == "gpt":
            value = msg.get("value")
            # Only the first gpt turn is considered; a null/non-string value
            # is treated the same as an empty response.
            return value.strip() if isinstance(value, str) else ""
    return ""


def main() -> None:
    """Convert the ShareGPT dataset into a JSONL file at OUTPUT_PATH.

    Loads the "train" split of Nitral-AI/Reddit-SFW-Writing_Prompts_ShareGPT,
    takes the first "gpt" turn of every row, and writes one JSON object per
    line with the keys "source", "id", "prompt", "model" and "output".
    Rows whose first gpt turn is absent or empty are skipped. The number of
    written records is logged at INFO level when done.
    """
    # os.makedirs("") raises, so only create the directory when the output
    # path actually has a directory component.
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ds = load_dataset("Nitral-AI/Reddit-SFW-Writing_Prompts_ShareGPT", split="train")

    kept = 0
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f_out:
        for idx, row in enumerate(tqdm(ds, desc="Extracting GPT turns")):
            gpt_msg = _first_gpt_text(row.get("conversations"))
            if not gpt_msg:
                continue  # skip rows without a usable GPT response

            record = {
                "source": SOURCE_NAME,
                "id": idx,  # index of the row in the source split, not a running count
                "prompt": row.get("title", ""),  # not used by analysis but nice to keep
                "model": MODEL_FIELD,
                "output": gpt_msg,
            }
            # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
            f_out.write(json.dumps(record, ensure_ascii=False) + "\n")
            kept += 1

    logging.info("Finished. Wrote %d records to %s", kept, OUTPUT_PATH)

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


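# Then run the profiling script on the generated dataset (shell command, not part of the Python script above):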
python3 scripts/slop_profile.py   --input-dir results/datasets   --analysis-output-dir results/analysis_human   --combined-output-file results/human_slop_profile.json   --top-n 1000000 --max-items 9999999