================================================================================
📊 DATASET: enhancer_annotation
================================================================================

[Basic Info]
Total entries: 285
Columns: ['chromosome', 'start', 'end', 'gene', 'score', 'strand', 'split']
Splits: ['part2', 'part9', 'part5', 'part3', 'part8', 'part10', 'part6', 'part4', 'part7', 'part1']

[Chromosome Distribution]
chromosome
chr1     32
chr19    31
chr3     20
chr17    19
chr7     17
chr2     17
chr6     15
chr11    14
chr12    14
chr9     14
chrX     13
chr5     13
chr10    12
chr4     11
chr8      9
chr20     8
chr22     7
chr14     6
chr15     5
chr16     3
chr21     2
chr13     2
chr18     1
Name: count, dtype: int64

[Strand Distribution]
strand
.    285
Name: count, dtype: int64

[Sequence Length Stats]
Min: 100096, Max: 100096, Mean: 100096.00, Median: 100096.0

--- Loading HDF5 file: /media/data/BEND_backups/data/enhancer_annotation/enhancer_annotation.hdf5 ---

[Label Info]
Shape: (285,)
Multi-label classification (ragged 1D) detected.
Label counts (top 10):
Label 0.0: 220185
Label 1.0: 2685
================================================================================
📊 DATASET: gene_finding
================================================================================

[Basic Info]
Total entries: 5977
Columns: ['chromosome', 'start', 'end', 'transcript_id', 'strand', 'flank_left', 'flank_right', 'length', 'split']
Splits: ['train', 'test', 'valid']

[Chromosome Distribution]
chromosome
chr19    645
chr1     609
chr17    496
chr11    444
chr16    351
chr2     318
chr12    317
chrX     314
chr6     272
chr3     247
chr7     246
chr9     236
chr5     197
chr14    192
chr8     183
chr20    183
chr10    164
chr4     143
chr22    141
chr15    125
chr13     58
chr21     42
chr18     38
chrY      16
Name: count, dtype: int64

[Strand Distribution]
strand
-    3004
+    2973
Name: count, dtype: int64

[Sequence Length Stats]
Min: 1433, Max: 13000, Mean: 7453.56, Median: 7330.0

--- Loading HDF5 file: /media/data/BEND_backups/data/gene_finding/gene_finding.hdf5 ---

[Label Info]
Shape: (5977,)
Multi-label classification (ragged 1D) detected.
Label counts (top 10):
Label 0.0: 2651094
Label 1.0: 11364
Label 2.0: 11130235
Label 3.0: 11364
Label 4.0: 2762499
Label 5.0: 11740
Label 6.0: 11077339
Label 7.0: 11740
Label 8.0: 16882542
================================================================================
📊 DATASET: chromatin_accessibility
================================================================================

[Basic Info]
Total entries: 2062129
Columns: ['chromosome', 'start', 'end', 'score', '-', 'strand', 'label', 'split']
Splits: ['train', 'valid', 'test']

[Chromosome Distribution]
chromosome
1     184574
2     171581
3     136340
6     119380
5     119216
7     112409
4     107841
10    102452
8     101461
11    100759
12     98673
9      86118
17     77068
16     67135
15     66532
14     64770
13     59198
20     58184
X      56512
19     54140
18     52719
22     38151
21     26916
Name: count, dtype: int64

[Strand Distribution]
strand
+    2062129
Name: count, dtype: int64

[Sequence Length Stats]
Min: 512, Max: 512, Mean: 512.00, Median: 512.0
⚠️ No HDF5 file found.
================================================================================
📊 DATASET: histone_modification
================================================================================

[Basic Info]
Total entries: 625229
Columns: ['chromosome', 'start', 'end', 'score', '-', 'strand', 'label', 'split']
Splits: ['train', 'valid', 'test']

[Chromosome Distribution]
chromosome
chr1     69700
chr2     47226
chr6     39094
chr7     37484
chr3     35207
chr11    33419
chr17    32535
chr5     30597
chr12    29791
chr10    28885
chr19    28676
chr16    27199
chr8     26147
chr9     24720
chr4     23575
chr15    22422
chr20    18020
chr14    14440
chr22    14177
chrX     13148
chr18    10256
chr13     9624
chr21     8887
Name: count, dtype: int64

[Strand Distribution]
strand
+    625229
Name: count, dtype: int64

[Sequence Length Stats]
Min: 512, Max: 512, Mean: 512.00, Median: 512.0
⚠️ No HDF5 file found.
================================================================================
📊 DATASET: cpg_methylation
================================================================================

[Basic Info]
Total entries: 959039
Columns: ['chromosome', 'start', 'end', 'strand', 'label', 'split']
Splits: ['train', 'test', 'valid']

[Chromosome Distribution]
chromosome
chr2     89910
chr1     87674
chr17    61940
chr7     59751
chr3     56884
chr12    54240
chr6     51135
chr5     50685
chr10    48272
chr16    44941
chr11    44129
chr8     41862
chr9     41712
chr4     37675
chr15    37533
chr14    36334
chr19    34817
chr13    23283
chr18    19243
chr22    17320
chr21    10452
chrX      9224
chrY        23
Name: count, dtype: int64

[Strand Distribution]
strand
-    512737
+    446302
Name: count, dtype: int64

[Sequence Length Stats]
Min: 512, Max: 512, Mean: 512.00, Median: 512.0
⚠️ No HDF5 file found.
================================================================================
📊 DATASET: disease_vep
================================================================================
❌ BED file not found: /media/data/BEND_backups/data/disease_vep/disease_vep.bed
================================================================================
📊 DATASET: expression_vep
================================================================================
❌ BED file not found: /media/data/BEND_backups/data/expression_vep/expression_vep.bed