# scripts/fetch_GSE196343_metformin.py
"""
Metformin RNA-seq (GSE196343)
- Downloads processed XLSX with expression (RPKM) from GEO
- Builds group labels (Control vs Metformin)
- Computes simple DE on logCPM proxy (note: RPKM -> treated as abundance proxy)
- Saves DE table + figures
GEO page: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE196343
"""
import os, io, sys, zipfile, pathlib
import pandas as pd
import numpy as np
import requests

sys.path.append(str(pathlib.Path(__file__).resolve().parent))
from analysis_common import differential_expression, volcano_plot, heatmap_top_genes, simple_cpm

BASE = pathlib.Path(__file__).resolve().parents[1]
RAW = BASE / "data" / "raw"
PROC = BASE / "data" / "processed"
FIGS = BASE / "figures"

RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)
FIGS.mkdir(parents=True, exist_ok=True)

# 1) Download XLSX (processed RPKM)
xlsx_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE196nnn/GSE196343/suppl/GSE196343_LStewart_3139-LVS-RPKM_SPratap.xlsx"
xlsx_path = RAW / "GSE196343_RPKM.xlsx"
if not xlsx_path.exists():
    r = requests.get(xlsx_url, timeout=60)
    r.raise_for_status()
    with open(xlsx_path, "wb") as f:
        f.write(r.content)

# 2) Load XLSX (assumes first sheet contains gene rows × sample columns or vice versa)
df = pd.read_excel(xlsx_path, sheet_name=0, engine="openpyxl")

# Try to auto-detect gene column
gene_col = None
for cand in ["gene", "Gene", "Symbol", "GeneSymbol", "Gene Symbol"]:
    if cand in df.columns:
        gene_col = cand
        break
if gene_col is None:
    # Sometimes first column is gene symbol
    gene_col = df.columns[0]

df = df.rename(columns={gene_col:"gene"})
df = df.dropna(subset=["gene"])

# Extract numeric columns as samples
sample_cols = [c for c in df.columns if c != "gene"]
expr = df[sample_cols].apply(pd.to_numeric, errors='coerce')
expr.index = df["gene"].astype(str)

# 3) Build group labels from sample names (PBS control vs Metformin)
# You may customize this mapping if the sheet uses different headers
group_labels = []
for c in expr.columns:
    name = c.lower()
    if "metformin" in name or "met" in name:
        group_labels.append("Metformin")
    else:
        group_labels.append("Control")

# 4) For DE, we treat RPKM as abundance proxy; compute DE with logCPM-based t-test (quick baseline)
res = differential_expression(expr, group_labels)
res_out = PROC / "GSE196343_metformin_de_results.csv"
res.to_csv(res_out)

# Figures
volcano_plot(res, out_png=str(FIGS / "GSE196343_metformin_volcano.png"),
             title="GSE196343 Metformin vs Control")

heatmap_top_genes(expr, res, group_labels,
                  out_png=str(FIGS / "GSE196343_metformin_heatmap.png"),
                  title="GSE196343 top DE (Metformin)")

print("Done. Results:")
print(res_out)
print(FIGS / "GSE196343_metformin_volcano.png")
print(FIGS / "GSE196343_metformin_heatmap.png")
