import os
import pandas as pd
import numpy as np
from scipy.stats import norm

# Configuration: input tables, image root directory and output file.
ADNI_MERGE_PATH = '/data/qiuhui/data/adni/ADNIMERGE.csv'
DEMOGRAPHIC_PATH = '/data/qiuhui/data/adni/PTDEMOG_19Sep2024.csv'
MEDHIST_PATH = '/data/qiuhui/data/adni/MEDHIST_19Sep2024.csv'
LABDATA_PATH = '/data/qiuhui/data/adni/LABDATA_19Sep2024_blood_analysis.csv'
IMAGE_ROOT = '/data/qiuhui/data/adni/images'
OUTPUT_FILE = './ADNI4.csv'

# Lookup tables.
# Raw diagnosis label -> one of the three output classes (CN / MCI / AD).
_LABEL_MAP = {
    'CN': 'CN', 'SMC': 'CN',
    'MCI': 'MCI', 'EMCI': 'MCI', 'LMCI': 'MCI',
    'AD': 'AD', 'Dementia': 'AD', 'Patient': 'AD',
}

# PTHAND code -> handedness.
hand_map = {1: 'Right', 2: 'Left'}

# PTRACCAT code (looked up as a string) -> race category.
racial_map = {
    '1': 'American Indian or Alaskan Native',
    '2': 'Asian',
    '3': 'Native Hawaiian or Other Pacific Islander',
    '4': 'Black or African American',
    '5': 'White',
    '6': 'More than one race',
    '7': 'Unknown',
}

# Brain-region column name -> wording used in the generated text.
region_map = {
    'Ventricles': 'ventricular',
    'Hippocampus': 'hippocampal',
    'WholeBrain': 'whole brain',
    'Entorhinal': 'entorhinal cortex',
    'Fusiform': 'fusiform gyrus',
    'MidTemp': 'middle temporal gyrus',
}

# Lower-cased medical-history column name -> readable category.
medhis_map = {
    "mh18surg": "Major Surgical Procedures",
    "mh17mali": "Malignancy",
    "mh16smok": "Smoking",
    "mh15drug": "Drug Abuse",
    "mh14alch": "Alcohol Abuse",
    "mh13alle": "Allergies or Drug Sensitivities",
    "mh12rena": "Renal-Genitourinary",
    "mh11hema": "Hematopoietic-Lymphatic",
    "mh10gast": "Gastrointestinal",
    "mh9endo": "Endocrine-Metabolic",
    "mh8muscl": "Musculoskeletal",
    "mh7derm": "Dermatologic-Connective Tissue",
    "mh6hepat": "Hepatic",
    "mh5resp": "Respiratory",
    "mh4card": "Cardiovascular",
    "mh3head": "Head, Eyes, Ears, Nose and Throat",
    "mh2neurl": "Neurologic",
    "mhpsych": "Psychiatric",
}

# Convert a VISCODE visit code into months since baseline.
def viscode_to_months(viscode):
    """Convert a VISCODE to the number of months since baseline.

    Args:
        viscode: Visit code such as ``'bl'`` or ``'m12'``.

    Returns:
        int: The number that follows the leading ``'m'``; 0 for ``'bl'``, for
        codes without a month count (e.g. ``'sc'``), for malformed ``'m...'``
        codes, and for missing or non-string values.
    """
    if not isinstance(viscode, str):
        # Missing cells come out of pandas as float NaN, which has no
        # .startswith(); treat them like any other unknown code.
        return 0
    if viscode.startswith('m'):
        try:
            return int(viscode[1:])
        except ValueError:
            return 0
    # 'bl' and every other code (e.g. 'sc') count as baseline.
    return 0

# Load the ADNIMERGE table.
print("Loading ADNIMERGE data...")
adni_merge_df = pd.read_csv(ADNI_MERGE_PATH)

# Age at each visit: AGE is treated as the age at baseline, and the months
# encoded in VISCODE are added on top of it.
print("Calculating current age...")
adni_merge_df['MONTHS_FROM_BL'] = adni_merge_df['VISCODE'].apply(viscode_to_months)
adni_merge_df['CURRENT_AGE'] = adni_merge_df['AGE'] + (adni_merge_df['MONTHS_FROM_BL'] / 12.0)

# Force the biomarker columns to float; entries that cannot be parsed become NaN.
for col in ['PIB', 'AV45', 'FBB', 'ABETA', 'TAU', 'PTAU']:
    adni_merge_df[col] = pd.to_numeric(adni_merge_df[col], errors='coerce')

# Keep only the columns that are used downstream.
adni_merge_df = adni_merge_df[['RID', 'PTID', 'VISCODE', 'EXAMDATE', 'DX', 'AGE', 'CURRENT_AGE', 'PTGENDER',
                              'PTEDUCAT', 'MMSE', 'MOCA', 'CDRSB', 'LDELTOTAL', 'APOE4',
                              'Ventricles', 'Hippocampus', 'WholeBrain',
                              'Entorhinal', 'Fusiform', 'MidTemp',
                              'PIB', 'AV45', 'FBB', 'ABETA', 'TAU', 'PTAU']]

# Reference group for the brain-region Z-scores: every CN visit with a known age.
print("Creating reference group for brain region Z-scores...")
cn_df = adni_merge_df[
    (adni_merge_df['DX'] == 'CN') &
    (adni_merge_df['AGE'].notna())
].copy()

# Decade bins are left-closed ([50, 60), [60, 70), ...) so that they agree with
# their labels and with the `current_age < upper_bound` lookup performed in
# calculate_zscore(). With pd.cut's default right-closed bins a visit at exactly
# 60.0 years was filed under '50-59'.
cn_df['age_group'] = pd.cut(cn_df['CURRENT_AGE'],
                            bins=[50, 60, 70, 80, 90, 100],
                            labels=['50-59', '60-69', '70-79', '80-89', '90+'],
                            right=False)

# Reference mean/std per region, keyed by (age group, gender).
reference_stats = {}
regions = ['Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp']

for region in regions:
    region_stats = {}
    for age_group in cn_df['age_group'].cat.categories:
        for gender in ['Male', 'Female']:
            group_data = cn_df.loc[
                (cn_df['age_group'] == age_group) &
                (cn_df['PTGENDER'] == gender) &
                (cn_df[region].notna()),
                region,
            ]

            # Need more than 5 points before and after outlier removal.
            if len(group_data) <= 5:
                continue

            # Drop outliers outside the 1.5 * IQR fences.
            q1 = group_data.quantile(0.25)
            q3 = group_data.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            filtered_data = group_data[group_data.between(lower_bound, upper_bound)]

            if len(filtered_data) > 5:
                mean = filtered_data.mean()
                std = filtered_data.std()
                region_stats[(age_group, gender)] = (mean, std)

    reference_stats[region] = region_stats

# Z-score of a brain-region volume against the matching CN reference group.
def calculate_zscore(region, value, current_age, gender):
    """Score a region volume against the CN reference of the same age group and gender.

    Args:
        region: Region column name used as key into ``reference_stats``.
        value: Measured volume.
        current_age: Age at the visit, in years.
        gender: Gender value used as part of the reference key.

    Returns:
        tuple: ``(zscore, percentile, percent_diff)``. ``percentile`` is the
        normal CDF of the Z-score times 100; ``percent_diff`` is the relative
        difference from the reference mean in percent (0 if the mean is 0).
        ``(None, None, None)`` is returned when an input is missing, the age is
        outside [50, 100), no reference statistics exist for the group, or the
        reference standard deviation is 0.
    """
    if pd.isna(current_age) or pd.isna(gender) or pd.isna(value):
        return None, None, None

    # The reference groups only cover ages 50 up to (not including) 100.
    # Younger subjects used to be scored against the '50-59' group; treat them
    # like the >= 100 case instead, i.e. no reference available.
    if not 50 <= current_age < 100:
        return None, None, None

    age_labels = ['50-59', '60-69', '70-79', '80-89', '90+']
    upper_bounds = [60, 70, 80, 90, 100]
    age_group = next(
        label for label, upper in zip(age_labels, upper_bounds)
        if current_age < upper
    )

    stats = reference_stats.get(region, {}).get((age_group, gender))
    if stats is None:
        return None, None, None

    mean, std = stats
    if std == 0:  # avoid division by zero
        return None, None, None

    zscore = (value - mean) / std
    percentile = norm.cdf(zscore) * 100
    percent_diff = ((value - mean) / mean) * 100 if mean != 0 else 0

    return zscore, percentile, percent_diff

# Build the text fragment that describes one brain-region volume.
def generate_region_description(region, value, zscore, percentile, percent_diff):
    """Describe a region volume, with severity wording when a Z-score is available.

    Args:
        region: Region column name; translated through ``region_map`` and used
            verbatim when it has no entry there.
        value: Measured volume (printed as mm³ with thousands separators).
        zscore: Z-score against the reference group, or None.
        percentile: Percentile that belongs to ``zscore``, or None.
        percent_diff: Percent difference from the reference mean.

    Returns:
        str: A fragment without a trailing period or space. The caller joins
        fragments with "; " and appends the final ". " itself, so a period here
        would produce ".;" or "..".
    """
    region_name = region_map.get(region, region)

    if zscore is None or percentile is None:
        return f"{region_name} volume measures {value:,.0f} mm³"

    # Severity from the size of the deviation.
    magnitude = abs(zscore)
    if magnitude > 3:
        severity = "profound"
    elif magnitude > 2:
        severity = "significant"
    elif magnitude > 1.5:
        severity = "moderate"
    elif magnitude > 1:
        severity = "mild"
    else:
        severity = "normal volume"

    # Direction from the sign; empty inside the normal band.
    if zscore > 1:
        direction = "enlargement"
    elif zscore < -1:
        direction = "atrophy"
    else:
        direction = ""

    # Joining only the non-empty words avoids a dangling space after
    # "normal volume".
    finding = " ".join(word for word in (severity, direction) if word)

    return (
        f"{region_name} volume measures {value:,.0f} mm³, "
        f"{abs(percent_diff):.1f}% {'above' if percent_diff > 0 else 'below'} "
        f"the reference mean. With a Z-score of {zscore:.2f} "
        f"({percentile:.1f}th percentile), this represents {finding}"
    )

# Load the auxiliary tables: demographics, medical history and lab results.
print("Loading additional data...")
demographic_df = pd.read_csv(DEMOGRAPHIC_PATH)
medhist_df = pd.read_csv(MEDHIST_PATH)
labdata_df = pd.read_csv(LABDATA_PATH)

# Build text_dict: PTID -> EXAMDATE -> record of everything known about the visit.
print("Building text dictionary...")
text_dict = {}
for _, visit in adni_merge_df.iterrows():
    # Visits without an age or a diagnosis are left out entirely.
    if pd.isna(visit['CURRENT_AGE']) or pd.isna(visit['DX']):
        continue

    visit_record = {
        'age': visit['CURRENT_AGE'],  # age at the visit, not the baseline age
        'gender': visit['PTGENDER'],
        'educ': visit['PTEDUCAT'],
        'mmse': visit['MMSE'],
        'moca': visit['MOCA'],
        'cdr': visit['CDRSB'],
        'ldeltotal': visit['LDELTOTAL'],
        'apoe': visit['APOE4'],
        'label': visit['DX'],
        'regions': {name: visit[name] for name in regions},
        'biomarkers': {
            marker: visit[marker]
            for marker in ('PIB', 'AV45', 'FBB', 'ABETA', 'TAU', 'PTAU')
        },
    }

    # Create the per-participant dict on first use, then file the visit by date.
    text_dict.setdefault(visit['PTID'], {})[visit['EXAMDATE']] = visit_record

# Attach demographic, medical-history and lab records to every visit in
# text_dict. All three tables are matched to a visit with the same rule, which
# is implemented once in _closest_record().

def _index_by_ptid(table):
    """Parse ``USERDATE`` in place and return ``{PTID: rows of that participant}``.

    Grouping once replaces a scan of the whole table for every single visit.
    """
    table['USERDATE'] = pd.to_datetime(table['USERDATE'])
    return {ptid_key: rows for ptid_key, rows in table.groupby('PTID')}


def _closest_record(rows_by_ptid, ptid, examdate, max_days=60):
    """Return the participant's row whose USERDATE is closest to ``examdate``.

    Only rows from the same calendar year as the exam and at most ``max_days``
    days away qualify. On a tie the row that comes first in the table wins.
    Returns None when nothing qualifies.

    NOTE(review): the same-year requirement is kept from the original logic; it
    rejects rows that are within ``max_days`` but across a year boundary.
    Confirm that this is intended.
    """
    rows = rows_by_ptid.get(ptid)
    if rows is None:
        return None

    exam_date = pd.to_datetime(examdate)
    if pd.isna(exam_date):
        return None

    same_year = rows[rows['USERDATE'].dt.year == exam_date.year]
    if same_year.empty:
        return None

    date_gap = (same_year['USERDATE'] - exam_date).abs()
    in_window = date_gap <= pd.Timedelta(days=max_days)
    if not in_window.any():
        return None

    # Positional argmin so the result does not depend on index labels.
    candidates = same_year[in_window]
    return candidates.iloc[date_gap[in_window].to_numpy().argmin()]


def _race_code(raw):
    """Turn a PTRACCAT cell into the string form used as ``racial_map`` key.

    A numeric column that contains missing values is parsed as float, and
    ``str(5.0)`` is ``'5.0'``, which never matches the ``'5'`` key.
    """
    if isinstance(raw, float) and raw.is_integer():
        return str(int(raw))
    return str(raw).strip()


print("Adding demographic data...")
demographic_by_ptid = _index_by_ptid(demographic_df)
for ptid, visits in text_dict.items():
    for examdate, visit in visits.items():
        closest = _closest_record(demographic_by_ptid, ptid, examdate)
        if closest is None:
            continue
        visit['demographic'] = {
            'Hand': hand_map.get(closest['PTHAND'], 'Unknown'),
            'Racial': racial_map.get(_race_code(closest['PTRACCAT']), 'Unknown'),
        }

print("Adding medical history...")
med_fields = [
    'MHPSYCH', 'MH2NEURL', 'MH3HEAD', 'MH4CARD', 'MH5RESP',
    'MH6HEPAT', 'MH7DERM', 'MH8MUSCL', 'MH9ENDO', 'MH10GAST',
    'MH11HEMA', 'MH12RENA', 'MH13ALLE', 'MH14ALCH', 'MH15DRUG',
    'MH16SMOK', 'MH17MALI', 'MH18SURG'
]

medhist_by_ptid = _index_by_ptid(medhist_df)
for ptid, visits in text_dict.items():
    for examdate, visit in visits.items():
        closest = _closest_record(medhist_by_ptid, ptid, examdate)
        if closest is None:
            continue

        # Keep every answered item; -1 and NaN are treated as "no answer".
        medical_history = {}
        for field in med_fields:
            value = closest[field]
            if not pd.isna(value) and value != -1:
                medical_history[field] = value

        visit['medical_history'] = medical_history


print("Adding biospecimen data...")
lab_fields = [
    'AXT117', 'BAT126', 'HMT10', 'HMT100', 'HMT102', 'HMT11',
    'HMT12', 'HMT13', 'HMT15', 'HMT16', 'HMT17', 'HMT18', 'HMT19',
    'HMT2', 'HMT3', 'HMT40', 'HMT7', 'HMT8', 'HMT9', 'RCT1',
    'RCT11', 'RCT12', 'RCT13', 'RCT14', 'RCT1407', 'RCT1408',
    'RCT183', 'RCT19', 'RCT20', 'RCT29', 'RCT3', 'RCT392', 'RCT4',
    'RCT5', 'RCT6', 'RCT8', 'RCT9'
]

labdata_by_ptid = _index_by_ptid(labdata_df)
for ptid, visits in text_dict.items():
    for examdate, visit in visits.items():
        closest = _closest_record(labdata_by_ptid, ptid, examdate)
        if closest is None:
            continue

        lab_data = {}
        for field in lab_fields:
            # .get() tolerates lab columns that are absent from the table, the
            # same allowance the reference-statistics step makes.
            value = closest.get(field)
            if pd.isna(value):
                continue
            try:
                value = float(value)
            except (TypeError, ValueError):
                continue  # non-numeric entry
            if value != -1:  # -1 is treated as "missing"
                lab_data[field] = value

        visit['biospecimen'] = lab_data

# Reference statistics for the lab Z-scores.
# The reference group is every dated lab record of any participant who has at
# least one CN diagnosis in ADNIMERGE; it is not restricted to baseline visits.
print("Creating reference group for lab data Z-scores...")

cn_ptids = adni_merge_df.loc[adni_merge_df['DX'] == 'CN', 'PTID'].unique()

lab_ref_df = labdata_df[
    labdata_df['PTID'].isin(cn_ptids) & labdata_df['USERDATE'].notna()
].copy()

# field -> (mean, std) after outlier removal.
lab_reference_stats = {}
for field in lab_fields:
    # Skip lab fields that the table does not contain.
    if field not in lab_ref_df.columns:
        continue

    # Non-numeric entries become NaN; -1 is treated as missing as well.
    numeric = pd.to_numeric(lab_ref_df[field], errors='coerce')
    clean_data = numeric.replace(-1, np.nan).dropna()

    if len(clean_data) <= 5:
        print(f"Not enough samples for {field}: {len(clean_data)}")
        continue

    # Drop outliers outside the 1.5 * IQR fences.
    q1 = clean_data.quantile(0.25)
    q3 = clean_data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    filtered_data = clean_data[clean_data.between(lower_bound, upper_bound)]

    if len(filtered_data) <= 5:
        print(f"Not enough samples after filtering for {field}: {len(filtered_data)}")
        continue

    mean = filtered_data.mean()
    std = filtered_data.std()
    lab_reference_stats[field] = (mean, std)
    print(f"Calculated reference for {field}: mean={mean:.2f}, std={std:.2f} (based on {len(filtered_data)} samples)")

# Z-score of a lab value against the lab reference statistics.
def calculate_lab_zscore(field, value):
    """Return the Z-score of ``value`` for lab ``field``.

    Returns None when the value is missing, when ``lab_reference_stats`` has no
    entry for the field, or when the reference standard deviation is 0.
    """
    if pd.isna(value):
        return None

    reference = lab_reference_stats.get(field)
    if reference is None:
        return None

    mean, std = reference
    # A zero spread would mean dividing by zero.
    return None if std == 0 else (value - mean) / std

# Lab field code -> readable test name. Built once instead of on every call.
# NOTE(review): several names occur twice (e.g. HMT10 and HMT17 are both
# 'Monocytes') — presumably one code is a count and the other a percentage;
# confirm against the data dictionary.
_LAB_NAME_MAP = {
    'AXT117': 'Thyroid Stim. Hormone',
    'BAT126': 'Vitamin B12',
    'HMT10': 'Monocytes',
    'HMT100': 'MCH',
    'HMT102': 'MCHC',
    'HMT11': 'Eosinophils',
    'HMT12': 'Basophils',
    'HMT13': 'Platelets',
    'HMT15': 'Neutrophils',
    'HMT16': 'Lymphocytes',
    'HMT17': 'Monocytes',
    'HMT18': 'Eosinophils',
    'HMT19': 'Basophils',
    'HMT2': 'Hematocrit',
    'HMT3': 'RBC',
    'HMT40': 'Hemoglobin',
    'HMT7': 'WBC',
    'HMT8': 'Neutrophils',
    'HMT9': 'Lymphocytes',
    'RCT1': 'Total Bilirubin',
    'RCT11': 'Serum Glucose',
    'RCT12': 'Total Protein',
    'RCT13': 'Albumin',
    'RCT14': 'Creatine Kinase',
    'RCT1407': 'Alkaline Phosphatase',
    'RCT1408': 'LDH',
    'RCT183': 'Calcium (EDTA)',
    'RCT19': 'Triglycerides (GPO)',
    'RCT20': 'Cholesterol (High Performance)',
    'RCT29': 'Direct Bilirubin',
    'RCT3': 'GGT',
    'RCT392': 'Creatinine (Rate Blanked)',
    'RCT4': 'ALT (SGPT)',
    'RCT5': 'AST (SGOT)',
    'RCT6': 'Urea Nitrogen',
    'RCT8': 'Serum Uric Acid',
    'RCT9': 'Phosphorus',
}


# Build the text fragment that describes one lab result.
def generate_lab_description(field, value, zscore):
    """Describe a lab value, qualified by severity when a Z-score is given.

    Args:
        field: Lab field code; unknown codes are printed as-is.
        value: Measured value; anything ``float()`` cannot convert is echoed
            unchanged as ``"<name>: <value>"``.
        zscore: Z-score against the reference group, or None.

    Returns:
        str: ``"<name>: <value with 2 decimals>"``, followed by
        ``" (<severity> <direction>)"`` or ``" (normal)"`` when ``zscore`` is
        not None. Without a Z-score no parenthesis is added; this used to
        yield an empty ``"( )"``.
    """
    name = _LAB_NAME_MAP.get(field, field)

    try:
        formatted_value = f"{float(value):.2f}"
    except (ValueError, TypeError):
        return f"{name}: {value}"

    if zscore is None:
        return f"{name}: {formatted_value}"

    # Severity from the size of the deviation.
    magnitude = abs(zscore)
    if magnitude > 3:
        severity = "profound"
    elif magnitude > 2:
        severity = "significant"
    elif magnitude > 1.5:
        severity = "moderate"
    elif magnitude > 1:
        severity = "mild"
    else:
        severity = "normal"

    # Direction from the sign; empty inside the normal band.
    if zscore > 1:
        direction = "elevated"
    elif zscore < -1:
        direction = "reduced"
    else:
        direction = ""

    # Joining only the non-empty words avoids "(normal )".
    qualifier = " ".join(word for word in (severity, direction) if word)
    return f"{name}: {formatted_value} ({qualifier})"

# Reference statistics for the biomarker Z-scores, taken from every CN visit
# that has a value for the biomarker.
print("Creating reference group for biomarker Z-scores...")
biomarker_fields = ['PIB', 'AV45', 'FBB', 'ABETA', 'TAU', 'PTAU']
biomarker_ref_stats = {}

for field in biomarker_fields:
    cn_data = adni_merge_df.loc[
        (adni_merge_df['DX'] == 'CN') & adni_merge_df[field].notna(),
        field,
    ]

    # Need more than 5 points before and after outlier removal.
    if len(cn_data) <= 5:
        continue

    # Drop outliers outside the 1.5 * IQR fences.
    q1 = cn_data.quantile(0.25)
    q3 = cn_data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    filtered_data = cn_data[cn_data.between(lower_bound, upper_bound)]

    if len(filtered_data) <= 5:
        continue

    mean = filtered_data.mean()
    std = filtered_data.std()
    biomarker_ref_stats[field] = (mean, std)

# Z-score of a biomarker value against the biomarker reference statistics.
def calculate_biomarker_zscore(field, value):
    """Return the Z-score of ``value`` for biomarker ``field``.

    Returns None when the value is missing, when ``biomarker_ref_stats`` has no
    entry for the field, or when the reference standard deviation is 0.
    """
    if pd.isna(value):
        return None

    reference = biomarker_ref_stats.get(field)
    if reference is None:
        return None

    mean, std = reference
    # A zero spread would mean dividing by zero.
    return None if std == 0 else (value - mean) / std

# Build the text fragment that describes one biomarker value.
def generate_biomarker_description(field, value, zscore):
    """Describe a biomarker value, qualified by severity when a Z-score is given.

    Args:
        field: Biomarker column name; unknown names are printed as-is.
        value: Measured value; anything ``float()`` cannot convert is echoed
            unchanged as ``"<name>: <value>"``.
        zscore: Z-score against the reference group, or None.

    Returns:
        str: ``"<name>: <value> <unit>"``, followed by
        ``" (<severity> <direction>)"`` or ``" (normal)"`` when ``zscore`` is
        not None. A None Z-score used to raise UnboundLocalError because the
        severity and direction were only assigned inside the Z-score branch.
    """
    biomarker_names = {
        'PIB': 'Pittsburgh compound B',
        'AV45': 'Florbetapir (AV45)',
        'FBB': 'Florbetaben (FBB)',  # spelling fixed (was 'Flobetaben')
        'ABETA': 'Amyloid beta',
        'TAU': 'Total tau',
        'PTAU': 'Phosphorylated tau',
    }
    name = biomarker_names.get(field, field)

    try:
        num_value = float(value)
    except (ValueError, TypeError):
        return f"{name}: {value}"

    # ABETA/TAU/PTAU are printed in pg/mL, every other marker as SUVR.
    unit = "pg/mL" if field in ('ABETA', 'TAU', 'PTAU') else "SUVR"
    description = f"{name}: {num_value:.2f} {unit}"

    if zscore is None:
        return description

    # Severity from the size of the deviation.
    magnitude = abs(zscore)
    if magnitude > 3:
        severity = "profound"
    elif magnitude > 2:
        severity = "significant"
    elif magnitude > 1.5:
        severity = "moderate"
    elif magnitude > 1:
        severity = "mild"
    else:
        severity = "normal"

    # Direction from the sign; empty inside the normal band.
    if zscore > 1:
        direction = "elevated"
    elif zscore < -1:
        direction = "reduced"
    else:
        direction = ""

    # Joining only the non-empty words avoids "(normal )".
    qualifier = " ".join(word for word in (severity, direction) if word)
    return f"{description} ({qualifier})"

# Walk the image tree and build one tab-separated record per T1 image:
# image path, image findings, clinical text, diagnosis.
print("Processing images and generating text...")
to_write = []
ptid_list = sorted(os.listdir(IMAGE_ROOT))

for ptid in ptid_list:
    ptid_path = os.path.join(IMAGE_ROOT, ptid)
    if not os.path.isdir(ptid_path):
        continue

    exam_dates = sorted(
        name for name in os.listdir(ptid_path) if name != '.DS_Store'
    )

    for exam_date in exam_dates:
        img_path = os.path.join(ptid_path, exam_date, 't1.nii.gz')
        if not os.path.exists(img_path):
            print('not exist: ',img_path, ', pass!')
            continue

        # Folder names are expected to look like YYYY-MM-DD.
        try:
            img_y, img_m, _ = exam_date.split('-')
            img_month = int(img_m)
        except ValueError:
            continue

        # Pick the visit of the same year whose month is closest to the image,
        # at most 2 months away. Taking the first visit within range could
        # attach a farther visit although a nearer one exists.
        matched_record = None
        best_month_gap = None
        for record_date, record_data in text_dict.get(ptid, {}).items():
            try:
                record_y, record_m, _ = record_date.split('-')
                month_gap = abs(img_month - int(record_m))
            except (AttributeError, ValueError):
                continue  # missing (non-string) or malformed EXAMDATE
            if record_y != img_y or month_gap > 2:
                continue
            if best_month_gap is None or month_gap < best_month_gap:
                matched_record, best_month_gap = record_data, month_gap

        if matched_record is None:
            continue

        factors = matched_record
        segments = []  # pieces of the clinical text, each ending in ". "
        imgfinding = ""
        diagnosis = ""

        # Basic information.
        if not pd.isna(factors['age']):
            segments.append(f"Age is {factors['age']:.1f} years. ")
        if factors['gender'] and not pd.isna(factors['gender']):
            segments.append(f"Gender is {factors['gender']}. ")
        if not pd.isna(factors['educ']):
            segments.append(f"Education: {factors['educ']} years. ")

        # Demographics.
        if 'demographic' in factors:
            demo = factors['demographic']
            segments.append(f"Handedness: {demo['Hand']}. Race: {demo['Racial']}. ")

        # Medical history: only the categories answered with 1 are listed. The
        # heading is left out when there is none, instead of emitting an empty
        # "Medical history: . ".
        med_items = [
            medhis_map.get(condition.lower(), condition)
            for condition, value in factors.get('medical_history', {}).items()
            if value == 1
        ]
        if med_items:
            segments.append("Medical history: " + "; ".join(med_items) + ". ")

        # Cognitive tests.
        if not pd.isna(factors['mmse']):
            segments.append(f"MMSE: {factors['mmse']}. ")
        if not pd.isna(factors['moca']):
            segments.append(f"MoCA: {factors['moca']}. ")
        if not pd.isna(factors['ldeltotal']):
            segments.append(f"Logical Memory: {factors['ldeltotal']}. ")

        # Lab results: only values more than 2 standard deviations from the
        # reference mean are reported.
        lab_items = []
        for test, value in factors.get('biospecimen', {}).items():
            try:
                num_value = float(value)
            except (ValueError, TypeError):
                continue

            zscore = calculate_lab_zscore(test, num_value)
            # No reference statistics -> None; abs(None) would raise TypeError.
            if zscore is not None and abs(zscore) > 2:
                lab_items.append(generate_lab_description(test, num_value, zscore))
        if lab_items:
            segments.append("Laboratory findings: " + "; ".join(lab_items) + ". ")

        # APOE.
        if not pd.isna(factors['apoe']):
            segments.append(f"APOEε4 alleles: {factors['apoe']}. ")

        # Biomarkers: every available value is reported with its severity.
        bio_descriptions = []
        for marker, value in factors.get('biomarkers', {}).items():
            if pd.isna(value):
                continue
            try:
                num_value = float(value)
            except (ValueError, TypeError):
                continue

            zscore = calculate_biomarker_zscore(marker, num_value)
            bio_descriptions.append(
                generate_biomarker_description(marker, num_value, zscore)
            )
        if bio_descriptions:
            segments.append("Biomarker levels: " + "; ".join(bio_descriptions) + ". ")

        text = "".join(segments)

        # Image findings from the brain-region volumes.
        region_descriptions = []
        for region, value in factors.get('regions', {}).items():
            if pd.isna(value):
                continue
            zscore, percentile, percent_diff = calculate_zscore(
                region, value, factors['age'], factors['gender']
            )
            region_descriptions.append(
                generate_region_description(
                    region, value, zscore, percentile, percent_diff
                )
            )
        if region_descriptions:
            imgfinding = "Image findings: " + "; ".join(region_descriptions) + ". "

        # Diagnosis label, collapsed to CN / MCI / AD where a mapping exists.
        if factors['label'] and not pd.isna(factors['label']):
            dx_label = _LABEL_MAP.get(factors['label'], factors['label'])
            diagnosis = f"Diagnosis: {dx_label}."

        to_write.append(f"{img_path}\t{imgfinding}\t{text}\t{diagnosis}\n")

# Write all records to the output file.
print(f"Writing {len(to_write)} records to {OUTPUT_FILE}...")
# Explicit UTF-8: the records contain non-ASCII characters ("mm³", "APOEε4"),
# which the platform's default encoding may be unable to encode.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    f.writelines(to_write)

print("Processing completed successfully!")