import geopandas as gpd
import pycountry

# # Load the built-in dataset of world geometries
# world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# Example mapping from a language/locale code to the ISO 3166-1 alpha-3 codes
# of the countries it is assigned to (replace with your actual data).
language_to_country = {
    "EN-GB": ["GBR"],
    "EN-US": ["USA"],
    "FR": ["FRA"],
    "DE": ["DEU"],
    "ES": ["ESP"],
    "RU": ["RUS"],
    "ZH": ["CHN"],
    "JA": ["JPN"],
    "IT": ["ITA"],
    "AR": ["SAU", "DZA", "EGY"],
    "BG": ["BGR"],
    "CS": ["CZE"],
    "DA": ["DNK"],
    "EL": ["GRC"],
    "ET": ["EST"],
    "FI": ["FIN"],
    "HU": ["HUN"],
    "ID": ["IDN"],
    "KO": ["KOR"],
    "LT": ["LTU"],
    "LV": ["LVA"],
    "NB": ["NOR"],
    "NL": ["NLD"],
    "PL": ["POL"],
    "PT-BR": ["BRA"],
    "PT-PT": ["PRT"],
    "RO": ["ROU"],
    "SK": ["SVK"],
    "SL": ["SVN"],
    "SV": ["SWE"],
    "TR": ["TUR"],
    "UK": ["UKR"],
}

# # Flatten the list of all ISO codes currently mapped to languages
# mapped_iso_codes = {iso for isos in language_to_country.values() for iso in isos}

# # Find countries in the world dataset that are not mapped
# unmapped_countries = world[~world['iso_a3'].isin(mapped_iso_codes)]

# # Extract and print the names of these unmapped countries
# unmapped_country_names = unmapped_countries['name'].tolist()
# print("Countries not currently mapped:")
# print(unmapped_country_names)

# country_to_language = {
#     "Fiji": "English",
#     "Tanzania": "Swahili",
#     "W. Sahara": "Hassaniya Arabic",
#     "Canada": ["English", "French"],
#     "Kazakhstan": "Kazakh",
#     "Uzbekistan": "Uzbek"
# }




# Language names for each country that is not covered by language_to_country,
# keyed by country name.
#
# A value is one of:
#   * a single language name (str),
#   * a list of language names, or
#   * None when no language is assigned (see "Antarctica").
# Consumers must handle all three shapes.
#
# NOTE(review): the keys look like Natural Earth short names (e.g. "W. Sahara",
# "Dem. Rep. Congo") -- confirm against the dataset's 'name' column before
# joining on them.
country_to_language = {
    "Fiji": ["English", "Fijian", "Hindi"],
    "Tanzania": ["Swahili", "English"],
    "W. Sahara": "Hassaniya Arabic",
    "Canada": ["English", "French"],
    "Kazakhstan": "Kazakh",
    "Uzbekistan": "Uzbek",
    "Papua New Guinea": ["English", "Tok Pisin", "Hiri Motu"],
    "Argentina": "Spanish",
    "Chile": "Spanish",
    "Dem. Rep. Congo": ["French", "Lingala", "Kiswahili", "Kikongo", "Tshiluba"],
    "Somalia": ["Somali", "Arabic"],
    "Kenya": ["English", "Swahili"],
    "Sudan": ["Arabic", "English"],
    "Chad": "French",
    "Haiti": ["French", "Haitian Creole"],
    "Dominican Rep.": "Spanish",
    "Bahamas": "English",
    "Falkland Is.": "English",
    "Greenland": "Kalaallisut",
    "Fr. S. Antarctic Lands": "French",
    "Timor-Leste": ["Tetum", "Portuguese"],
    "South Africa": ["Zulu", "Xhosa", "Afrikaans", "English", "Northern Sotho", "Tswana", "Sotho", "Tsonga", "Swati", "Venda", "Ndebele"],
    "Lesotho": ["Sesotho", "English"],
    "Mexico": "Spanish",
    "Uruguay": "Spanish",
    "Bolivia": ["Spanish", "Quechua", "Aymara"],
    "Peru": ["Spanish", "Quechua", "Aymara"],
    "Colombia": "Spanish",
    "Panama": "Spanish",
    "Costa Rica": "Spanish",
    "Nicaragua": "Spanish",
    "Honduras": "Spanish",
    "El Salvador": "Spanish",
    "Guatemala": "Spanish",
    "Belize": ["English", "Spanish"],
    "Venezuela": "Spanish",
    "Guyana": "English",
    "Suriname": "Dutch",
    "Ecuador": "Spanish",
    "Puerto Rico": ["Spanish", "English"],
    "Jamaica": "English",
    "Cuba": "Spanish",
    "Zimbabwe": ["English", "Shona", "Ndebele"],
    "Botswana": ["English", "Tswana"],
    "Namibia": ["English"],
    "Senegal": "French",
    "Mali": "French",
    "Mauritania": "Arabic",
    "Benin": "French",
    "Niger": "French",
    "Nigeria": ["English"],
    "Cameroon": ["English", "French"],
    "Togo": "French",
    "Ghana": ["English"],
    "Côte d'Ivoire": "French",
    "Guinea": "French",
    "Guinea-Bissau": ["Portuguese", "Upper Guinea Creole"],
    "Liberia": "English",
    "Sierra Leone": ["English"],
    "Burkina Faso": "French",
    "Central African Rep.": "Sango",
    "Congo": "French",
    "Gabon": "French",
    "Eq. Guinea": ["Spanish", "French", "Portuguese"],
    "Zambia": ["English"],
    "Malawi": ["English", "Chichewa"],
    "Mozambique": "Portuguese",
    "eSwatini": ["English", "Swati"],
    "Angola": "Portuguese",
    "Burundi": ["Kirundi", "French"],
    "Israel": ["Hebrew", "Arabic"],
    "Lebanon": ["Arabic", "French"],
    "Madagascar": ["Malagasy", "French"],
    "Palestine": ["Arabic"],
    "Gambia": ["English"],
    "Tunisia": "Arabic",
    "Jordan": "Arabic",
    "United Arab Emirates": "Arabic",
    "Qatar": "Arabic",
    "Kuwait": "Arabic",
    "Iraq": "Arabic",
    "Oman": "Arabic",
    "Vanuatu": ["Bislama", "English", "French"],
    "Cambodia": "Khmer",
    "Thailand": "Thai",
    "Laos": "Lao",
    "Myanmar": "Burmese",
    "Vietnam": "Vietnamese",
    "North Korea": "Korean",
    "Mongolia": "Mongolian",
    "India": ["Hindi", "English"],
    "Bangladesh": "Bengali",
    "Bhutan": "Dzongkha",
    "Nepal": "Nepali",
    "Pakistan": ["Urdu", "English"],
    "Afghanistan": ["Pashto", "Dari"],
    "Tajikistan": "Tajik",
    "Kyrgyzstan": "Kyrgyz",
    "Turkmenistan": "Turkmen",
    "Iran": "Persian",
    "Syria": "Arabic",
    "Armenia": "Armenian",
    "Belarus": "Belarusian",
    "Austria": "German",
    "Moldova": "Romanian",
    "Albania": "Albanian",
    "Croatia": "Croatian",
    "Switzerland": ["German", "French", "Italian", "Romansh"],
    "Luxembourg": ["Luxembourgish", "French", "German"],
    "Belgium": ["Dutch", "French", "German"],
    "Ireland": ["English", "Irish"],
    "New Caledonia": "French",
    "Solomon Is.": "English",
    "New Zealand": ["English", "Maori"],
    "Australia": "English",
    "Sri Lanka": ["Sinhala", "Tamil"],
    "Taiwan": ["Mandarin"],
    "Iceland": "Icelandic",
    "Azerbaijan": "Azerbaijani",
    "Georgia": "Georgian",
    "Philippines": ["Filipino", "English"],
    "Malaysia": ["Malay"],
    "Brunei": ["Malay"],
    "Eritrea": ["Tigrinya", "Arabic", "English"],
    "Paraguay": ["Spanish", "Guarani"],
    "Yemen": "Arabic",
    "Antarctica": None,  # no language assigned; the only non-str, non-list value
    "N. Cyprus": "Turkish",
    "Cyprus": ["Greek", "Turkish"],
    "Morocco": "Arabic",
    "Libya": "Arabic",
    "Ethiopia": "Amharic",
    "Djibouti": ["French", "Arabic"],
    "Somaliland": "Somali",
    "Uganda": "English",
    "Rwanda": ["Kinyarwanda", "French", "English"],
    "Bosnia and Herz.": ["Bosnian", "Croatian", "Serbian"],
    "North Macedonia": "Macedonian",
    "Serbia": "Serbian",
    "Montenegro": "Montenegrin",
    "Kosovo": ["Albanian", "Serbian"],
    "Trinidad and Tobago": "English",
    "S. Sudan": ["English"],
}

# Resolve every language name used in country_to_language to its ISO 639-3
# code via pycountry. Names pycountry cannot resolve are recorded with the
# sentinel string 'Not found' so they can be filtered out later.
language_to_iso639 = {}
for names in country_to_language.values():
    if names is None:
        # Countries without a language (e.g. Antarctica) have nothing to look
        # up; previously None was handed to pycountry as if it were a name.
        continue
    if isinstance(names, str):
        # Normalise the single-language shorthand so one code path handles
        # both value shapes.
        names = [names]
    for lang in names:
        if lang in language_to_iso639:
            # Many countries share a language; resolve each name only once.
            continue
        try:
            language_to_iso639[lang] = pycountry.languages.lookup(lang).alpha_3
        except LookupError:
            language_to_iso639[lang] = 'Not found'

print("ISO 639-3 codes for languages in unmapped countries:")
print(language_to_iso639)


# Keep the ISO 639-3 code of every language name that was resolved
# successfully (i.e. whose entry is not the 'Not found' sentinel).
# NOTE(review): the keys of language_to_iso639 are language *names*
# ("English", "French", ...) while the keys of language_to_country are
# locale codes ("EN-GB", "FR", ...), so the membership test below does not
# exclude anything with the current data -- confirm what it was meant to do.
unmapped_language_codes = [
    iso_code
    for language_name, iso_code in language_to_iso639.items()
    if language_name not in language_to_country and iso_code != 'Not found'
]
print("Languages in unmapped countries:")
print(unmapped_language_codes)

# De-duplicate the resolved codes (different names can resolve to the same
# code). Sorting makes the order -- and therefore everything printed below --
# reproducible between runs; iterating a bare set of strings is not.
language_codes_iso639_3 = sorted(set(unmapped_language_codes))
# Presumably a snapshot from an earlier run, kept for reference:
# language_codes_iso639_3 = ['eng', 'fij', 'hin', 'fra', 'kaz', 'uzb', 'tpi', 'hmo', 'spa', 'lin', 'som', 'ara', 'kal', 'tet', 'por', 'zul', 'xho', 'afr', 'tsn', 'tso', 'ssw', 'ven', 'que', 'aym', 'nld', 'sna', 'sag', 'heb', 'mlg', 'bis', 'khm', 'tha', 'lao', 'mya', 'vie', 'kor', 'mon', 'ben', 'dzo', 'urd', 'prs', 'tgk', 'tuk', 'fas', 'hye', 'bel', 'deu', 'ron', 'sqi', 'hrv', 'ita', 'roh', 'ltz', 'gle', 'mri', 'sin', 'tam', 'isl', 'aze', 'kat', 'fil', 'tir', 'grn', 'tur', 'amh', 'kin', 'bos', 'srp', 'mkd', 'cnr']

# Map each ISO 639-3 code to its two-letter ISO 639-1 code, or to None when
# there is no two-letter code.
iso639_1_mapping = {}

for code in language_codes_iso639_3:
    language = pycountry.languages.get(alpha_3=code)
    # Not all languages have an ISO 639-1 code: a record without an alpha_2
    # attribute (or a missing record) maps to None instead of raising.
    iso639_1_mapping[code] = getattr(language, 'alpha_2', None)

print("ISO 639-3 to ISO 639-1 mapping:")
for key, value in iso639_1_mapping.items():
    print(f"{key}: {value}")

print("ISO 639-1 codes for languages in unmapped countries:")
print(iso639_1_mapping)

# Only the ISO 639-1 values, in the same order as the mapping; entries are
# None for languages that have no two-letter code.
language_codes_iso639_1 = list(iso639_1_mapping.values())
print(language_codes_iso639_1)