#!/usr/bin/env python3
# check_train_list.py
import os
# Root of the dataset tree.  <<< change this to your own data root
DATA_ROOT = '/home/ubuntu-user/Documents/AtticusDon'
# Directory that the list files live in and that list entries are resolved
# against.
SENT_DIR = os.path.join(DATA_ROOT, 'Sentences')
# Original and augmented training lists, both located directly in SENT_DIR.
ORIG_LIST, AUG_LIST = (
    os.path.join(SENT_DIR, list_name)
    for list_name in ('Train.txt', 'Train_augmented.txt')
)

def read_lines(path):
    """Return the non-blank lines of the text file at *path*, stripped.

    Args:
        path: Location of a UTF-8 text file with one entry per line.

    Returns:
        A list holding every non-blank line with leading/trailing
        whitespace removed, in file order.  An empty list when *path*
        does not exist.
    """
    if not os.path.exists(path):
        return []
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark.  With plain 'utf-8' the BOM would stay glued to the first entry,
    # because str.strip() does not treat U+FEFF as whitespace.
    with open(path, 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]

def _normalize_entry(entry):
    """Return a list entry as a '/'-separated path relative to Sentences/.

    Backslashes become forward slashes, then leading './' segments and
    leading '/' characters are removed.  ``str.lstrip('./')`` is not used
    because it strips a *set of characters*: it would also eat the dot of
    '.hidden/x.npz' and the whole '../' of '../x.npz'.
    """
    path = entry.replace('\\', '/')
    while True:
        if path.startswith('./'):
            path = path[2:]
        elif path.startswith('/'):
            path = path.lstrip('/')
        else:
            return path


orig = read_lines(ORIG_LIST)
aug = read_lines(AUG_LIST)
# Fall back to the original list when the augmented one is missing or empty.
aug_is_fallback = not aug
if aug_is_fallback:
    aug = orig

print("DATA_ROOT:", DATA_ROOT)
print("Sentences dir exists:", os.path.exists(SENT_DIR))
print("Original list lines:", len(orig))
print("Augmented list lines:", len(aug))
if aug_is_fallback:
    # Without this note the figures below look like they describe the
    # augmented list even though it was never read.
    print("NOTE: augmented list is missing or empty; using the original list instead")
print()

# show sample lines
print("Sample original list (first 8):", orig[:8])
print("Sample augmented list (first 8):", aug[:8])
print()

# check each entry in augmented list whether file exists under Sentences/
missing_on_disk = []
for entry in aug:
    expected = os.path.join(SENT_DIR, _normalize_entry(entry))
    if not os.path.exists(expected):
        missing_on_disk.append((entry, expected))
print("Entries listed but MISSING on disk:", len(missing_on_disk))
# Only the first 50 are printed to keep the report readable.
for listed_as, expected in missing_on_disk[:50]:
    print(" MISSING -> list:", listed_as, "expected path:", expected)
print()

# find synth files on disk
synth_files = []
for root, _dirs, files in os.walk(SENT_DIR):
    for fn in files:
        if '__synth' in fn and fn.endswith('.npz'):
            rel = os.path.relpath(os.path.join(root, fn), SENT_DIR).replace('\\', '/')
            synth_files.append(rel)
print("Synth files found on disk:", len(synth_files))
# Compare against normalized entries: the paths found on disk are already in
# normalized form, so an entry written as './a/x.npz' or 'a\\x.npz' must
# still count as listed.  A set also makes each lookup O(1).
listed = {_normalize_entry(entry) for entry in aug}
not_listed = [s for s in synth_files if s not in listed]
print("Synth files NOT present in augmented list:", len(not_listed))
for s in not_listed[:50]:
    print(" Synth NOT in list:", s)
