script	keep_rows_conv	remove_rows_conv	total_rows_conv	keep_tokens_conv	remove_tokens_conv	total_tokens_conv	keep_disk_converted	remove_disk_converted	total_disk_converted	language_count	source
Latn	5.50B	439.42M	5.93B	4.29T	327.39B	4.61T	21.13TB	4.91TB	26.12TB	1830	fineweb-2,fineweb_en,mala,New CC
Cyrl	1.11B	85.84M	1.19B	1.26T	98.88B	1.36T	9.40TB	2.43TB	11.83TB	91	fineweb-2,mala,New CC
Hani	715.15M	71.29M	786.45M	746.48B	73.89B	820.36B	2.90TB	1.60TB	4.50TB	12	fineweb-2,mala,New CC
Jpan	491.47M	42.47M	533.93M	278.14B	22.81B	300.95B	2.00TB	504.87GB	2.50TB	1	fineweb-2,New CC
Arab	198.64M	20.36M	219.03M	122.36B	13.12B	135.48B	1.03TB	290.11GB	1.31TB	60	fineweb-2,mala,New CC
Hang	79.22M	6.16M	85.38M	59.07B	4.62B	63.69B	336.56GB	66.70GB	403.26GB	1	fineweb-2,mala,New CC
Grek	69.14M	5.90M	75.04M	58.45B	5.15B	63.60B	432.64GB	120.10GB	552.76GB	4	fineweb-2,mala,New CC
Deva	60.09M	5.79M	65.87M	30.63B	2.56B	33.19B	342.83GB	72.51GB	415.37GB	48	fineweb-2,mala,New CC
Thai	55.73M	4.34M	60.06M	46.36B	3.60B	49.96B	526.40GB	110.69GB	637.11GB	11	fineweb-2,mala,New CC
Mlym	39.16M	3.89M	43.05M	7.00B	559.61M	7.56B	94.53GB	18.86GB	113.40GB	6	fineweb-2,mala,New CC
Gujr	38.91M	4.55M	43.46M	5.07B	461.70M	5.53B	60.22GB	13.54GB	73.76GB	2	fineweb-2,mala,New CC
Knda	34.20M	2.70M	36.90M	4.76B	359.45M	5.12B	68.85GB	11.14GB	79.99GB	2	fineweb-2,mala,New CC
Hebr	26.99M	1.83M	28.82M	21.15B	1.38B	22.53B	152.34GB	30.80GB	183.18GB	6	fineweb-2,mala,New CC
Taml	26.65M	2.92M	29.56M	5.88B	461.60M	6.35B	80.38GB	19.44GB	99.82GB	2	fineweb-2,mala,New CC
Guru	24.04M	3.16M	27.21M	2.27B	227.69M	2.50B	26.71GB	8.65GB	35.36GB	2	fineweb-2,mala,New CC
Beng	21.91M	1.51M	23.42M	12.67B	875.46M	13.54B	148.42GB	31.39GB	179.83GB	6	fineweb-2,mala,New CC
Geor	20.56M	1.36M	21.92M	6.19B	419.61M	6.61B	83.04GB	15.75GB	98.81GB	3	fineweb-2,mala,New CC
Armn	17.24M	1.46M	18.70M	4.74B	407.38M	5.15B	42.47GB	11.43GB	53.93GB	4	fineweb-2,mala,New CC
Telu	9.93M	821.21K	10.75M	3.91B	295.72M	4.20B	48.22GB	9.65GB	57.87GB	4	fineweb-2,mala,New CC
Sinh	9.91M	1.12M	11.03M	2.93B	251.40M	3.18B	32.73GB	7.64GB	40.37GB	1	fineweb-2,mala,New CC
Orya	6.57M	616.98K	7.18M	464.57M	37.89M	502.46M	9.79GB	2.20GB	12.01GB	6	fineweb-2,mala
Ethi	6.41M	429.99K	6.85M	1.38B	91.75M	1.46B	12.66GB	2.92GB	15.59GB	13	fineweb-2,mala,New CC
Mymr	6.04M	479.44K	6.52M	5.30B	406.67M	5.72B	40.57GB	7.83GB	48.39GB	9	fineweb-2,mala,New CC
Kana	5.83M	1.11M	6.94M	1.13B	219.26M	1.35B	16.90GB	14.33GB	31.23GB	1	fineweb-2
Khmr	4.96M	380.38K	5.34M	2.24B	160.29M	2.40B	30.95GB	4.99GB	35.95GB	7	fineweb-2,mala,New CC
Bamu	4.71M	1.00M	5.71M	199.46M	42.49M	241.95M	79.67GB	19.47GB	99.14GB	1	fineweb-2
Copt	4.40M	361.99K	4.76M	219.04M	18.03M	237.09M	8.97GB	864.17MB	9.84GB	2	fineweb-2
Tang	3.94M	741.81K	4.68M	209.68M	39.47M	249.15M	22.70GB	7.67GB	30.36GB	1	fineweb-2
Xsux	3.90M	694.59K	4.59M	276.93M	49.35M	326.28M	13.84GB	9.74GB	23.58GB	1	fineweb-2
Laoo	3.46M	470.52K	3.92M	840.28M	87.36M	927.65M	11.85GB	3.95GB	15.80GB	5	fineweb-2,mala,New CC
Yiii	3.39M	417.38K	3.81M	232.88M	28.68M	261.56M	25.82GB	6.24GB	32.05GB	1	fineweb-2
Hira	2.78M	579.38K	3.36M	361.77M	75.28M	437.05M	4.87GB	4.04GB	8.91GB	1	fineweb-2
Thaa	2.51M	301.28K	2.82M	425.90M	45.08M	470.98M	4.75GB	1.28GB	6.02GB	2	fineweb-2,mala,New CC
Kits	1.86M	315.45K	2.17M	269.54M	45.75M	315.29M	12.47GB	17.12GB	29.58GB	1	fineweb-2
Hluw	1.71M	374.92K	2.09M	70.77M	15.47M	86.25M	3.19GB	3.45GB	6.64GB	1	fineweb-2
Japn	1.60M	177.40K	1.78M	148.77M	17.99M	166.76M	6.05GB	2.16GB	8.21GB	1	mala
Shrd	1.41M	216.59K	1.62M	130.80M	20.13M	150.93M	6.06GB	2.35GB	8.40GB	1	fineweb-2
Lina	1.37M	271.63K	1.64M	130.39M	25.87M	156.26M	6.97GB	3.85GB	10.82GB	1	fineweb-2
Samr	1.35M	158.99K	1.51M	64.06M	7.54M	71.59M	4.30GB	1.72GB	6.02GB	1	fineweb-2
Cans	1.24M	248.84K	1.49M	109.29M	21.66M	130.96M	3.55GB	2.78GB	6.33GB	12	fineweb-2,mala
Syrc	1.12M	116.18K	1.23M	44.70M	4.75M	49.44M	20.70GB	4.35GB	25.04GB	4	fineweb-2,mala
Adlm	1.12M	194.29K	1.32M	43.63M	7.55M	51.18M	1.10GB	853.95MB	1.95GB	1	fineweb-2
Egyp	1.12M	190.50K	1.31M	97.41M	16.58M	113.99M	2.54GB	3.52GB	6.05GB	1	fineweb-2
Mend	1.03M	293.72K	1.32M	16.58M	4.75M	21.33M	893.39MB	2.06GB	2.95GB	1	fineweb-2
Linb	735.07K	107.67K	842.75K	52.97M	7.76M	60.73M	6.30GB	997.90MB	7.30GB	1	fineweb-2
Brai	590.10K	125.33K	715.43K	57.85M	12.29M	70.13M	1.94GB	1.30GB	3.24GB	1	fineweb-2
Sgnw	567.29K	106.45K	673.74K	37.34M	7.01M	44.34M	1.40GB	1.11GB	2.50GB	1	fineweb-2
Tibt	544.99K	70.33K	615.32K	288.24M	33.57M	321.81M	4.50GB	1.53GB	6.09GB	4	fineweb-2,mala,New CC
Hung	520.10K	155.23K	675.33K	42.34M	12.64M	54.98M	1.94GB	2.32GB	4.25GB	1	fineweb-2
Mong	435.35K	61.47K	496.83K	119.66M	16.95M	136.62M	1.97GB	1.04GB	3.03GB	3	fineweb-2,mala
Bali	422.49K	77.08K	499.57K	39.62M	7.23M	46.84M	1.19GB	662.91MB	1.85GB	1	fineweb-2
Nshu	419.71K	89.40K	509.11K	38.53M	8.21M	46.74M	993.06MB	1.28GB	2.27GB	1	fineweb-2
Modi	386.82K	67.33K	454.15K	52.58M	9.15M	61.73M	16.45GB	7.42GB	23.87GB	1	fineweb-2
Lana	377.58K	110.80K	488.38K	47.55M	13.95M	61.50M	688.16MB	2.05GB	2.74GB	1	fineweb-2
Saur	315.78K	73.82K	389.60K	15.26M	3.57M	18.83M	398.55MB	489.07MB	887.62MB	1	fineweb-2
Dupl	258.90K	53.06K	311.96K	14.14M	2.90M	17.04M	752.58MB	502.95MB	1.26GB	1	fineweb-2
Runr	252.18K	39.00K	291.19K	154.68M	23.92M	178.61M	1.25GB	3.28GB	4.52GB	2	fineweb-2,mala
Vaii	243.47K	93.27K	336.73K	71.28M	27.31M	98.59M	513.30MB	1.88GB	2.39GB	1	fineweb-2
Glag	237.68K	72.07K	309.75K	20.38M	6.18M	26.56M	476.61MB	951.96MB	1.43GB	1	fineweb-2
Dsrt	198.00K	37.90K	235.90K	4.47M	855.49K	5.32M	248.83MB	562.92MB	811.75MB	1	fineweb-2
Mroo	186.14K	22.85K	208.99K	6.42M	788.69K	7.21M	2.43GB	335.38MB	2.77GB	1	fineweb-2
Bopo	181.71K	24.45K	206.16K	30.63M	4.12M	34.75M	3.45GB	890.68MB	4.35GB	1	fineweb-2
Mtei	175.69K	20.34K	196.03K	49.11M	5.76M	54.87M	805.36MB	574.03MB	1.38GB	2	fineweb-2,mala
Khar	153.37K	40.04K	193.41K	6.75M	1.76M	8.52M	250.30MB	182.38MB	432.67MB	1	fineweb-2
Brah	138.03K	22.72K	160.75K	7.85M	1.29M	9.15M	273.71MB	243.75MB	517.47MB	1	fineweb-2
Bhks	131.90K	27.03K	158.93K	3.93M	805.58K	4.74M	190.96MB	154.63MB	345.59MB	1	fineweb-2
Hmnp	118.87K	12.33K	131.20K	6.83M	708.37K	7.54M	436.28MB	151.81MB	588.09MB	1	fineweb-2
Phag	107.75K	17.58K	125.34K	3.41M	556.36K	3.97M	141.68MB	93.31MB	234.99MB	1	fineweb-2
Merc	107.52K	38.04K	145.56K	7.61M	2.69M	10.30M	215.43MB	472.23MB	687.66MB	1	fineweb-2
Kali	105.87K	24.33K	130.20K	1.39M	319.46K	1.71M	105.24MB	91.45MB	196.70MB	2	fineweb-2
Plrd	104.31K	21.07K	125.38K	5.47M	1.10M	6.57M	214.53MB	225.25MB	439.77MB	1	fineweb-2
Lisu	101.48K	20.06K	121.53K	24.00M	4.74M	28.74M	204.24MB	527.21MB	731.45MB	2	fineweb-2
Hmng	101.02K	23.34K	124.36K	5.37M	1.24M	6.61M	153.20MB	196.99MB	350.19MB	1	fineweb-2
Nkoo	98.77K	25.89K	124.65K	4.91M	1.07M	5.98M	2.13GB	233.87MB	2.36GB	2	fineweb-2,mala
Gran	97.96K	21.57K	119.53K	3.57M	785.93K	4.36M	135.27MB	243.90MB	379.18MB	1	fineweb-2
Gonm	94.82K	16.28K	111.10K	2.83M	486.36K	3.32M	106.89MB	142.16MB	249.05MB	1	fineweb-2
Cher	94.19K	25.99K	120.19K	9.12M	2.45M	11.57M	245.29MB	689.18MB	934.47MB	2	fineweb-2,mala
Tnsa	89.55K	17.93K	107.48K	3.28M	656.33K	3.93M	98.49MB	204.04MB	302.53MB	1	fineweb-2
Cprt	88.19K	14.11K	102.30K	7.87M	1.26M	9.13M	142.36MB	85.91MB	228.27MB	1	fineweb-2
Cari	77.73K	18.09K	95.82K	1.73M	401.78K	2.13M	89.37MB	76.01MB	165.38MB	1	fineweb-2
Diak	68.42K	22.40K	90.82K	2.87M	938.52K	3.81M	58.40MB	94.36MB	152.76MB	1	fineweb-2
Marc	67.80K	11.89K	79.69K	2.34M	410.50K	2.75M	66.51MB	95.34MB	161.85MB	1	fineweb-2
Mani	65.94K	9.56K	75.50K	6.27M	908.84K	7.17M	128.39MB	140.35MB	268.75MB	1	fineweb-2
Talu	65.77K	11.95K	77.72K	1.27M	231.55K	1.50M	78.51MB	62.21MB	140.72MB	2	fineweb-2,mala
Vith	65.14K	12.13K	77.28K	2.49M	464.49K	2.96M	124.41MB	95.26MB	219.66MB	1	fineweb-2
Nagm	63.57K	11.94K	75.51K	1.03M	193.45K	1.22M	58.20MB	73.87MB	132.08MB	1	fineweb-2
Ahom	60.21K	9.69K	69.90K	2.34M	376.34K	2.72M	127.53MB	70.68MB	198.21MB	1	fineweb-2
Java	58.52K	13.32K	71.84K	2.18M	496.30K	2.68M	66.55MB	116.13MB	182.68MB	1	fineweb-2
Palm	48.99K	5.32K	54.32K	424.13K	46.09K	470.22K	39.41MB	43.82MB	83.23MB	1	fineweb-2
Wara	46.80K	9.12K	55.92K	1.47M	286.76K	1.76M	58.48MB	52.76MB	111.24MB	1	fineweb-2
Olck	45.80K	4.06K	49.86K	6.69M	492.54K	7.19M	86.16MB	38.55MB	124.71MB	2	fineweb-2,mala
Khoj	39.85K	5.23K	45.09K	892.46K	117.20K	1.01M	43.07MB	40.20MB	83.27MB	1	fineweb-2
Rohg	35.21K	5.32K	40.53K	534.34K	80.72K	615.05K	36.76MB	41.06MB	77.82MB	1	fineweb-2
Sidd	34.75K	8.41K	43.16K	3.03M	732.80K	3.76M	46.06MB	93.44MB	139.51MB	1	fineweb-2
Yezi	33.92K	3.35K	37.27K	96.61K	9.53K	106.13K	29.36MB	14.31MB	43.67MB	1	fineweb-2
Ougr	32.34K	6.13K	38.47K	442.16K	83.82K	525.98K	31.03MB	37.95MB	68.98MB	1	fineweb-2
Avst	32.16K	6.62K	38.78K	1.75M	360.09K	2.11M	51.64MB	53.81MB	105.46MB	1	fineweb-2
Ital	32.06K	5.06K	37.12K	519.27K	81.93K	601.19K	34.30MB	29.24MB	63.53MB	1	fineweb-2
Wcho	31.94K	6.51K	38.45K	1.48M	301.04K	1.78M	58.25MB	74.54MB	132.79MB	1	fineweb-2
Kthi	31.07K	5.44K	36.51K	763.52K	133.75K	897.27K	30.79MB	35.73MB	66.52MB	1	fineweb-2
Tavt	30.95K	3.63K	34.57K	670.82K	78.65K	749.47K	29.30MB	14.97MB	44.26MB	1	fineweb-2
Takr	30.70K	5.29K	35.99K	1.73M	298.02K	2.03M	30.89MB	45.59MB	76.48MB	1	fineweb-2
Tfng	29.84K	3.34K	33.18K	1.42M	148.55K	1.57M	35.12MB	24.87MB	59.99MB	4	fineweb-2
Tale	26.17K	2.80K	28.98K	220.84K	23.64K	244.48K	23.80MB	16.84MB	40.64MB	1	fineweb-2
Elba	24.86K	4.61K	29.48K	394.51K	73.22K	467.73K	24.19MB	19.19MB	43.38MB	1	fineweb-2
Zanb	24.46K	4.76K	29.21K	327.39K	63.68K	391.07K	26.07MB	40.03MB	66.10MB	1	fineweb-2
Sogo	22.29K	3.88K	26.16K	146.13K	25.41K	171.54K	17.82MB	20.07MB	37.89MB	1	fineweb-2
Soyo	22.21K	4.91K	27.12K	598.89K	132.47K	731.36K	25.04MB	36.77MB	61.81MB	1	fineweb-2
Dogr	21.29K	3.82K	25.11K	1.28M	229.94K	1.51M	29.94MB	23.89MB	53.84MB	1	fineweb-2
Kawi	20.28K	4.10K	24.38K	396.57K	80.26K	476.83K	20.90MB	24.30MB	45.20MB	1	fineweb-2
Phli	19.16K	2.88K	22.04K	41.16K	6.19K	47.35K	17.52MB	7.60MB	25.13MB	1	fineweb-2
Cham	17.92K	3.60K	21.52K	762.24K	153.32K	915.57K	21.12MB	39.91MB	61.03MB	1	fineweb-2
Nbat	17.61K	3.19K	20.80K	280.13K	50.76K	330.89K	18.90MB	15.97MB	34.87MB	1	fineweb-2
Nand	17.39K	3.36K	20.75K	307.12K	59.32K	366.44K	17.76MB	19.20MB	36.96MB	1	fineweb-2
Osma	16.98K	2.59K	19.57K	495.54K	75.61K	571.15K	19.16MB	15.11MB	34.27MB	1	fineweb-2
Sind	14.81K	4.24K	19.05K	315.61K	90.31K	405.93K	21.16MB	18.70MB	39.86MB	1	fineweb-2
Sogd	14.52K	2.73K	17.24K	307.50K	57.79K	365.30K	14.67MB	9.73MB	24.40MB	1	fineweb-2
Pauc	13.23K	4.28K	17.50K	1.88M	609.43K	2.49M	13.65MB	33.03MB	46.67MB	1	fineweb-2
Sylo	12.42K	2.88K	15.29K	922.71K	213.86K	1.14M	22.76MB	22.23MB	44.99MB	1	fineweb-2
Goth	11.84K	1.24K	13.08K	191.30K	19.67K	210.97K	11.59MB	3.62MB	15.22MB	2	fineweb-2,mala
Rjng	10.30K	2.36K	12.65K	595.51K	136.27K	731.78K	9.43MB	15.02MB	24.45MB	1	fineweb-2
Chrs	10.24K	1.26K	11.50K	45.98K	5.66K	51.64K	8.22MB	5.45MB	13.67MB	1	fineweb-2
Phlp	9.08K	2.03K	11.11K	31.62K	7.06K	38.69K	8.35MB	5.61MB	13.96MB	1	fineweb-2
Mand	8.73K	1.49K	10.21K	82.87K	14.11K	96.98K	9.07MB	5.24MB	14.31MB	1	fineweb-2
Tglg	8.58K	1.88K	10.46K	638.75K	140.15K	778.89K	11.22MB	10.89MB	22.11MB	1	fineweb-2
Shaw	8.41K	1.28K	9.69K	915.43K	139.72K	1.06M	13.65MB	12.62MB	26.27MB	1	fineweb-2
Hatr	7.44K	1.63K	9.07K	371.48K	81.61K	453.09K	10.15MB	13.53MB	23.68MB	1	fineweb-2
Bugi	7.03K	1.33K	8.36K	95.81K	18.11K	113.91K	6.90MB	6.18MB	13.09MB	2	fineweb-2,mala
Tagb	6.58K	1.14K	7.72K	30.92K	5.37K	36.30K	5.84MB	2.33MB	8.17MB	1	fineweb-2
Prti	6.05K	1.09K	7.15K	225.93K	40.79K	266.72K	7.31MB	4.57MB	11.89MB	1	fineweb-2
Narb	5.22K	835.0	6.06K	56.09K	8.97K	65.06K	6.01MB	7.12MB	13.13MB	1	fineweb-2
Sarb	4.99K	874.0	5.86K	170.46K	29.86K	200.31K	6.93MB	15.95MB	22.87MB	1	fineweb-2
Ugar	4.85K	653.0	5.50K	133.05K	17.92K	150.97K	4.03MB	2.47MB	6.50MB	1	fineweb-2
Lydi	4.59K	1.03K	5.62K	28.08M	6.29M	34.37M	77.22MB	70.99MB	148.21MB	1	fineweb-2
Buhd	3.16K	448.0	3.61K	7.77K	1.10K	8.87K	2.73MB	623.88KB	3.35MB	1	fineweb-2
Perm	2.87K	630.0	3.50K	19.17K	4.20K	23.37K	2.58MB	1.36MB	3.94MB	1	fineweb-2
Elym	1.66K	496.0	2.16K	61.25K	18.28K	79.53K	1.88MB	7.52MB	9.40MB	1	fineweb-2
Limb	59.0	15.0	74.0	32.32K	8.22K	40.53K	754.75KB	229.80KB	984.54KB	1	fineweb-2
Zyyy	17.0	8.0	25.0	2.27K	1.07K	3.33K	25.50KB	37.52KB	63.03KB	1	mala