import re
import random
import string


# Captures everything between ``FROM`` and ``WHERE`` in a query that has been
# flattened onto one line. The match is case-sensitive and greedy, so it runs
# from the first ``FROM`` to the last ``WHERE`` on that line.
FROM_CLAUSE_REGEX = r"FROM(.*)WHERE"
pattern = re.compile(FROM_CLAUSE_REGEX)

# Country names used as LIKE substrings for ``aka_title.title`` and
# ``movie_companies.note`` (see ``predicates``).
# The list is sampled with a seeded RNG, so element order and duplicates
# ("Denmark" appears twice) influence which queries get generated.
# "Côte d'Ivoire" contains a single quote and therefore needs escaping when it
# is embedded in a SQL string literal.
countries = ["International", "USA", "UK", "Canada", "India", "Brazil",
             "Japan", "Denmark", "Italy", "Australia", "Greece", "China",
             "Portugal", "Mexico", "Germany", "Russia", "France", "Sweden",
             "Indonesia", "Pakistan", "Nigeria", "Bangladesh", "Ethiopia",
             "Philippines", "Egypt", "Vietnam", "DR Congo", "Turkey",
             "Iran", "Thailand", "South Africa", "Myanmar", "Colombia",
             "Spain", "Ukraine", "Tanzania", "Argentina", "Kenya", "Poland",
             "Algeria", "Uganda", "Morocco", "Iraq", "Sudan", "Peru",
             "Malaysia", "Uzbekistan", "Saudi Arabia", "Nepal", "Venezuela",
             "Ghana", "Yemen", "Afghanistan", "Mozambique", "Cameroon",
             "Côte d'Ivoire", "Madagascar", "North Korea", "Niger",
             "Sri Lanka", "Romania", "Mali", "Syria", "Malawi", "Kazakhstan",
             "Zambia", "Zimbabwe", "Chile", "Guatemala", "Ecuador", "Senegal",
             "Cambodia", "Chad", "Somalia", "Rwanda", "Guinea", "Benin",
             "Burundi", "Tunisia", "Bolivia", "Haiti", "Dominican Republic",
             "Czech Republic", "Hungary", "Belarus", "Azerbaijan",
             "United Arab Emirates", "Honduras", "Switzerland", "Israel",
             "Tajikistan", "Austria", "Papua New Guinea", "Sierra Leone",
             "Hong Kong", "Libya", "Laos", "Paraguay", "Jordan", "Serbia",
             "Singapore", "Denmark", "Finland", "Slovakia", "Norway", "Liberia",
             "Ireland"]

# Studio / distributor names used as LIKE substrings for ``company_name.name``
# (see ``predicates``). Some names repeat (e.g. "Focus Features"), which makes
# them proportionally more likely to be drawn by ``random.sample``.
companies = [
    "Walt Disney Studios",
    "Warner Bros. Pictures",
    "Universal Pictures",
    "Paramount Pictures",
    "Sony Pictures Entertainment",
    "20th Century Fox",
    "Lionsgate",
    "MGM Studios",
    "DreamWorks Animation",
    "Legendary",
    "New Line Cinema",
    "A24",
    "Focus Features",
    "Miramax",
    "STX",
    "Studio Ghibli",
    "Blumhouse Productions",
    "Pixar Animation Studios",
    "Working Title Films",
    "Amblin Partners",
    "Amblin Television",
    "Columbia Pictures",
    "DreamWorks Pictures",
    "20th Century Studios",
    "Lionsgate Films",
    "Miramax Films",
    "Summit Entertainment",
    "Fox Searchlight Pictures",
    "Paramount Animation",
    "Focus Features",
    "A24 Films",
    "STX Entertainment",
    "Sony Pictures Animation",
    "New Regency Productions",
    "Constantin Film",
    "Village Roadshow Pictures",
    "Lakeshore Entertainment",
    "Lighthouse Pictures",
    "Screen Gems",
    "Participant Media",
    "Imagine Entertainment",
    "EuropaCorp",
    "Lionsgate UK",
    "FilmDistrict",
    "Entertainment One",
    "The Weinstein Company",
    "Roadside Attractions",
    "Broad Green Pictures",
    "Lava Bear Films",
    "Aviron Pictures",
    "Relativity Media",
    "Lantern Entertainment",
    "Neon",
    "STX Films",
    "Bleecker Street",
    "Vertical Entertainment",
    "Magnolia Pictures",
    "Gravitas Ventures",
    "CBS Films",
    "Well Go USA Entertainment",
    "High Top Releasing",
    "Wrekin Hill Entertainment"
]

# Fictional character names used as LIKE substrings for ``char_name.name``
# (see ``predicates``). Several names occur more than once (e.g. "Ellen Ripley"
# three times); the duplicates are kept because the list is sampled with a
# seeded RNG and its exact contents determine the generated queries.
character_names = [
    "James Bond",
    "Luke Skywalker",
    "Harry Potter",
    "Batman",
    "Spider-Man",
    "Darth Vader",
    "Superman",
    "Hermione Granger",
    "Indiana Jones",
    "Rocky Balboa",
    "Captain Jack Sparrow",
    "The Joker",
    "Katniss Everdeen",
    "Ellen Ripley",
    "Neo",
    "Frodo Baggins",
    "Iron Man",
    "Captain America",
    "Wonder Woman",
    "Wolverine",
    "Tony Stark",
    "Sherlock Holmes",
    "John Wick",
    "Hannibal Lecter",
    "The Terminator",
    "Gandalf",
    "Jason Bourne",
    "John McClane",
    "James T. Kirk",
    "Ellen Ripley",
    "Ethan Hunt",
    "Beatrix Kiddo",
    "Jeffrey Lebowski",
    "Maximus Decimus Meridius",
    "Forrest Gump",
    "Jack Sparrow",
    "Vito Corleone",
    "Clarice Starling",
    "Atticus Finch",
    "Marty McFly",
    "Tony Montana",
    "Donnie Darko",
    "T-800",
    "James P. Sullivan",
    "R2-D2",
    "Yoda",
    "Shrek",
    "Neo",
    "Ellen Ripley",
    "Mr. Bean",
    "Joker",
    "Aragorn",
    "Katniss Everdeen",
    "Hermione Granger",
    "Dracula",
    "Edward Scissorhands",
    "Don Corleone",
    "Rambo",
    "Rocky Balboa",
    "E.T.",
    "Jack Torrance",
    "Indiana Jones",
    "James Franco",
    "Iron Man",
    "Captain America",
    "Thor",
    "Black Widow",
    "Hulk",
    "Doctor Strange",
    "Wonder Woman",
    "Aquaman",
    "The Flash",
    "Harley Quinn",
    "Deadpool",
    "Groot",
    "Star-Lord",
    "Gamora",
    "Spider-Man",
    "Black Panther",
    "Captain Marvel",
    "Baby Groot",
    "Hagrid",
    "Legolas",
    "Jack Sparrow",
    "Willy Wonka",
    "Sherlock Holmes",
    "Katniss Everdeen",
    "John Wick",
    "Don Vito Corleone",
    "Tony Montana",
    "Dr. Hannibal Lecter",
    "Forrest Gump",
    "The Dude",
    "Maximus Decimus Meridius",
    "Rocky Balboa",
    "Tony Stark",
    "Jason Bourne",
    "Marty McFly",
    "Atticus Finch",
    "Luke Skywalker"
]


# Candidate values for the ``company_name.country_code`` IN-list predicate
# (see ``predicates``). Most entries are two-letter codes; a few longer ones
# ("cshh", "yucs", "ddde", "xyu", "suhh") are included as well.
# NOTE(review): IMDB/JOB dumps commonly store this column with surrounding
# brackets (e.g. "[us]") -- confirm these bare codes match the stored form.
country_codes = [
    "hn", "tg", "ag", "ng", "ae", "cl", "kz", "me", "sg", "mu", "bo", "gr", "sn", "sm", "id", "kn", "nr",
    "py", "hr", "vg", "ci", "rs", "az", "bn", "ch", "bs", "es", "mh", "za", "ad", "be", "mq", "an", "aw",
    "tz", "vn", "cz", "lv", "cm", "cshh", "ml", "mz", "tw", "sj", "sk", "jp", "ly", "qa", "pk", "do", "lt",
    "il", "pa", "iq", "kw", "ma", "tf", "so", "zm", "ua", "ge", "ru", "gf", "jo", "cu", "af", "co", "gl",
    "om", "ke", "sr", "ne", "cd", "is", "nz", "ga", "mr", "pl", "hk", "zw", "si", "li", "gd", "lu", "th",
    "ro", "it", "bi", "um", "mk", "ai", "bf", "ba", "sd", "td", "us", "no", "nl", "mn", "gh", "gy", "ee",
    "ve", "nc", "kr", "mm", "kg", "la", "mv", "tj", "my", "gt", "mx", "vi", "ky", "ki", "im", "yucs", "eg",
    "pf", "ao", "se", "ir", "mc", "mg", "rw", "na", "ddde", "lk", "ca", "ie", "tl", "va", "cg", "pr", "at",
    "tv", "tn", "dz", "pg", "br", "cv", "fo", "sz", "lr", "gu", "md", "et", "ls", "lb", "gw", "mt", "uy",
    "fj", "sa", "au", "dm", "kh", "fr", "gn", "lc", "bz", "gp", "gb", "ec", "fi", "kp", "bd", "bt", "mo",
    "tm", "ye", "sv", "ug", "gi", "am", "pt", "er", "by", "ps", "bh", "bj", "gg", "xyu", "de", "tt", "in",
    "bl", "bb", "pm", "uz", "suhh", "sl", "ht", "tr", "tk", "to", "jm", "al", "sy", "np", "dk", "cy", "as",
    "ar", "pe", "hu", "ni", "ph", "bm", "bw", "cr", "bg", "cn", "je"
]

# Candidate values for the ``company_type.kind`` IN-list predicate.
company_kinds = [
    "distributors",
    "production companies",
    "special effects companies",
    "miscellaneous companies",
]

# Candidate values for the ``info_type.info`` IN-list predicate (see
# ``predicates``). The strings are matched literally by the generated SQL, so
# their spelling (including oddities such as "intellegibility" and
# "availablility") must be left exactly as is.
info_types = [
    "books", "soundtrack", "LD quality of source", "studios", "LD sharpness", "LD picture format",
    "countries", "weekend gross", "crazy credits", "certificates", "agent address", "sound mix",
    "quotes", "LD audio quality", "LD close captions-teletext-ld-g", "locations", "taglines", "spouse",
    "LD contrast", "LD disc format", "death date", "birth notes", "screenplay-teleplay",
    "LD number of chapter stops", "LD additional information", "where now", "LD number of sides",
    "LD review", "production dates", "alternate versions", "copyright holder", "death notes",
    "LD production country", "LD spaciality", "bottom 10 rank", "interviews", "salary history",
    "goofs", "essays", "birth name", "votes", "LD dialogue intellegibility", "LD group genre",
    "LD color information", "printed media reviews", "LD original title", "LD year", "LD video artifacts",
    "LD length", "votes distribution", "LD video quality", "LD color rendition", "other literature",
    "opening weekend", "LD audio noise", "rating", "LD catalog number", "LD sound encoding",
    "LD aspect ratio", "pictorial", "LD release country", "book", "LD certification", "keywords",
    "languages", "birth date", "LD laserdisc title", "LD dynamic range", "mini biography",
    "magazine cover photo", "LD release date", "biographical movies", "mpaa", "release dates",
    "LD digital sound", "height", "LD master format", "LD supplement", "budget", "filming dates",
    "LD label", "adaption", "LD video noise", "gross", "other works", "genres", "plot", "novel",
    "LD analog left", "LD pressing plant", "LD official retail price", "trivia", "LD disc size",
    "article", "portrayed in", "production process protocol", "LD subtitles", "LD quality program",
    "color info", "admissions", "runtimes", "LD status of availablility", "LD video standard",
    "nick names", "rentals", "LD number", "LD language", "LD frequency response", "trade mark",
    "top 250 rank", "LD analog right", "LD category", "tech info"
]

# Words used as LIKE substrings for ``keyword.keyword`` (see ``predicates``).
# A few words repeat (e.g. "escape", "space"); duplicates are kept because the
# list is sampled with a seeded RNG and its exact contents affect the output.
keywords = [
    "love", "death", "friendship", "family", "romance", "murder", "revenge", "adventure", "supernatural",
    "betrayal", "drama", "action", "comedy", "fantasy", "war", "crime", "thriller", "mystery", "hero",
    "tragedy", "redemption", "suspense", "magic", "power", "identity", "destiny", "sacrifice", "discovery",
    "treasure", "rebellion", "rivalry", "transformation", "corruption", "quest", "passion", "rescue", "justice",
    "dreams", "vengeance", "escape", "secrets", "futuristic", "evil", "space", "music", "chase", "horror",
    "survival", "insanity", "desire", "betray", "betrayed", "spy", "paranormal", "monster", "future", "police",
    "alien", "escape", "fight", "gang", "obsession", "dream", "detective", "prison", "history", "betraying",
    "killer", "investigation", "journey", "destined", "murdered", "warfare", "assassin", "mafia", "resurrected",
    "outlaw", "space", "horror", "secret", "terror", "magical", "investigate", "warrior", "revolution",
    "battle", "danger", "vengeful", "escape", "romantic", "terrorist", "revengeful", "martial", "fear",
    "mysterious", "hunted", "fantasy", "space", "epic"]

# Candidate values for the ``kind_type.kind`` IN-list predicate.
kinds = [
    "movie",
    "tv series",
    "tv movie",
    "video movie",
    "tv mini series",
    "video game",
    "episode",
]

# Candidate values for the ``link_type.link`` IN-list predicate.
links = [
    "follows",
    "followed by",
    "remake of",
    "remade as",
    "references",
    "referenced in",
    "spoofs",
    "spoofed in",
    "features",
    "featured in",
    "spin off from",
    "spin off",
    "version of",
    "similar to",
    "edited into",
    "edited from",
    "alternate language version of",
    "unknown link",
]

# Language names used as candidate values for the ``movie_info.info`` IN-list
# predicate (see ``predicates``). Note that "None" here is the literal string,
# not Python's ``None``.
languages = [
    "English", "Japanese", "Bulgarian", "Czech", "Hungarian", "Serbo-Croatian", "Italian", "Spanish",
    "Portuguese", "Finnish", "German", "Danish", "Russian", "Greek", "Korean", "Dutch", "Serbian", "Slovak",
    "Latvian", "French", "Hebrew", "Norwegian", "Hindi", "Latin", "Tagalog", "Chinese", "Albanian", "Mandarin",
    "Croatian", "Filipino", "Chechen", "Cantonese", "Vietnamese", "Polish", "Turkish", "Welsh", "Catalan",
    "None", "Gujarati", "Arabic", "Swedish", "Romanian", "Greenlandic", "Irish Gaelic", "Slovenian", "Galician",
    "Indonesian", "Afrikaans", "Zulu", "Icelandic", "Swiss German", "Yiddish", "Tlingit", "Tamil", "Inuktitut",
    "Estonian", "Faroese", "Brazilian Sign Language", "Maori", "Macedonian", "Kyrgyz", "Rotuman", "Persian",
    "Ladino", "Mongolian", "Kazakh", "Ukrainian", "Kalmyk-Oirat", "Tatar", "Bosnian", "Urdu", "Flemish", "Dari",
    "Georgian"
]

# Thresholds for the ``movie_info_idx.info`` "compare" predicate. They are
# kept as strings because the generator emits them as quoted SQL literals.
intervals = [
    "2.0",
    "4.0",
    "6.0",
    "8.0",
    "10.0",
]

# Candidate values for the ``name.gender`` IN-list predicate.
genders = ["m", "f"]

# Role names as they would appear in ``role_type.role``.
roles = [
    "actor",
    "actress",
    "producer",
    "writer",
    "cinematographer",
    "composer",
    "costume designer",
    "director",
    "editor",
    "miscellaneous crew",
]

# Author / agency strings used as LIKE substrings for ``person_info.note``
# (see ``predicates``). Entries are matched literally, so casing and
# punctuation (e.g. "Kara Stevenson, Journalist") must be preserved.
person_info_notes = [
    "Eva Redpath", "Katy Monahan Huntley", "Norman George", "Emily Weber", "Kyle Lochner",
    "Adam Showtime Palmer", "Mike Martin", "Famous Black Raincoat", "Rodolfo Alvarez", "Reef",
    "Thasc", "IW", "BEF", "Mike Sutton", "Be Free Films", "Jillian Thomas", "Jade SinClair PR",
    "Steven D. Snyder", "Erin Bates", "R.E. Wells", "Craig Good", "Crystal Agents", "Felicia D. Henderson",
    "Merchant Phil", "Steve Harrison", "Charlie Todd", "HH", "Sean Boyle", "Rod Glenn", "Cindy Faith",
    "Eric Volkman", "Federica Stradi", "Edward Bosco", "peacham", "Tomas Street", "Rafael Gomez",
    "Timothy Evans", "Samson Yee", "colin theys", "MF", "Samantha Mashaw", "ALO", "PRW", "MEC", "Volker Boehm",
    "Galan Inc. Television/Film", "Don Mc Kay", "Juliane Block", "alpowell", "Tom Choi", "Hannah Nicholas",
    "Paul Hughes", "Eldorado", "Jay Dee", "Kara Stevenson, Journalist", "Chris Kolaskos", "Tristan Cowen"
]

# Common nouns used as LIKE substrings for ``title.title`` (see ``predicates``).
# Some words repeat (e.g. "mask", "shield", "potion", "sword", "dream");
# duplicates are kept because the list is sampled with a seeded RNG.
titles = [
    "love", "time", "night", "day", "man", "woman", "world", "life", "story", "city",
    "star", "house", "heart", "death", "road", "girl", "boy", "car", "door", "dream",
    "ring", "gun", "island", "game", "tree", "river", "money", "sword", "key", "book",
    "box", "window", "chair", "train", "boat", "ship", "treasure", "diamond", "letter", "picture",
    "mirror", "mask", "clock", "doll", "apple", "starship", "spaceship", "bottle", "suitcase", "camera",
    "pen", "guitar", "violin", "microphone", "drum", "knife", "fork", "spoon", "shoe", "hat",
    "dress", "suit", "mask", "helmet", "glove", "shield", "hammer", "shield", "potion", "potion",
    "flower", "sword", "crown", "wand", "crystal", "stone", "fire", "water", "ice", "wind",
    "thunder", "light", "darkness", "dream", "nightmare", "quest", "battle", "journey", "mission", "secret",
    "legend", "mystery", "code", "curse", "prophecy", "spell", "adventure", "comedy", "drama", "thriller"
]

# Candidate values for the ``comp_cast_type.kind`` IN-list predicate.
comp_cast_types = [
    "cast",
    "crew",
    "complete",
    "complete+verified",
]

# Single ASCII letters (a-z followed by A-Z) used as LIKE substrings for the
# name columns.
alphabetics = [*string.ascii_lowercase, *string.ascii_uppercase]

# Maps a table name to the random predicates that may be attached to it.
# Each entry is a tuple ``(column, kind, candidates, max_count)``:
#   * "like"    -> up to ``max_count`` ``LIKE '%value%'`` terms drawn from
#                  ``candidates``;
#   * "in"      -> an ``IN (...)`` list drawn from ``candidates``, sized from
#                  ``max_count``;
#   * "compare" -> a single ``<`` / ``>`` comparison against one candidate;
#   * "between" -> a range filter on a year column; this kind carries no
#                  candidate list, so its tuple is just ``(column, kind, n)``
#                  and the trailing number is not read by the generator loop.
predicates = {
    "aka_name": [("name", "like", alphabetics, 2)],
    "aka_title": [("title", "like", countries, 2)],
    "char_name": [("name", "like", character_names, 3)],
    "comp_cast_type": [("kind", "in", comp_cast_types, 5)],
    "company_name": [("name", "like", companies, 2), ("country_code", "in", country_codes, 10)],
    "company_type": [("kind", "in", company_kinds, 3)],
    "info_type": [("info", "in", info_types, 20)],
    "keyword": [("keyword", "like", keywords, 2)],
    "kind_type": [("kind", "in", kinds, 20)],
    "link_type": [("link", "in", links, 20)],
    "movie_companies": [("note", "like", countries, 3)],
    "movie_info": [("info", "in", languages, 12)],
    "movie_info_idx": [("info", "compare", intervals, 1)],
    "name": [("name", "like", alphabetics, 3), ("gender", "in", genders, 1)],
    "person_info": [("note", "like", person_info_notes, 3)],
    # Fixed: this entry used ``person_info_notes`` (people / agency names),
    # which can never match a role name; ``roles`` is the list meant for it.
    "role_type": [("role", "like", roles, 2)],
    "title": [("title", "like", titles, 3), ("production_year", "between", 2)]
}

# Seed the global RNG so that a run starting from the same state draws the
# same sequence of random predicates.
random.seed(100)

# Debug output: prints the 52 single-letter LIKE candidates at start-up.
print(alphabetics)

import os
import pymonetdb
from tqdm import tqdm

# Matches "<table> AS <alias>" pairs inside a FROM clause.
TABLE_ALIAS_PATTERN = re.compile(r'\b(\w+)\s+AS\s+(\w+)\b')


def quote_sql_literal(value, wildcard=False):
    """Return *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that values such as
    ``Côte d'Ivoire`` do not terminate the literal early. With
    ``wildcard=True`` the value is wrapped in ``%...%`` for use with LIKE.
    """
    escaped = str(value).replace("'", "''")
    if wildcard:
        escaped = f"%{escaped}%"
    return f"'{escaped}'"


def build_like_predicate(alias, column, candidates, max_candidates):
    """Build a random LIKE / NOT LIKE predicate on ``alias.column``.

    Between 1 and ``max_candidates`` substrings are drawn from ``candidates``.
    With probability 1/4 the predicate is negated (``not like`` terms joined
    by AND); otherwise the ``like`` terms are OR-ed inside parentheses.
    """
    # The order of RNG calls below is significant for seeded reproducibility.
    nr_candidates = random.randint(1, max_candidates)
    inverse = random.randint(1, 4) == 4
    chosen = random.sample(candidates, k=nr_candidates)
    like_patterns = [quote_sql_literal(value, wildcard=True) for value in chosen]
    if inverse:
        return " AND ".join(f"{alias}.{column} not like {p}" for p in like_patterns)
    return "(" + " OR ".join(f"{alias}.{column} like {p}" for p in like_patterns) + ")"


def build_in_predicate(alias, column, candidates, max_candidates):
    """Build a random IN / NOT IN predicate on ``alias.column``.

    The list size is drawn from ``[2, max_candidates + 1]`` and capped at
    ``len(candidates) - 1`` so the list never contains every candidate.
    With probability 1/4 the predicate is emitted as ``not in``.
    """
    nr_candidates = random.randint(2, max_candidates + 1)
    chosen = random.sample(candidates, k=min(nr_candidates, len(candidates) - 1))
    values = ",".join(quote_sql_literal(value) for value in chosen)
    operator = "not in" if random.randint(1, 4) == 4 else "in"
    return f"{alias}.{column} {operator} ({values})"


def build_between_predicate(alias, column):
    """Build a random year filter (``<``, ``>`` or an open range) on ``alias.column``."""
    if random.randint(1, 3) == 1:
        return f"{alias}.{column} < {random.randint(1990, 2012)}"
    # NOTE(review): this is a second, independent roll, so the three shapes
    # are chosen with probability 1/3, 2/9 and 4/9 rather than uniformly.
    # Kept as is so the generated workload distribution does not change.
    if random.randint(1, 3) == 2:
        return f"{alias}.{column} > {random.randint(1990, 2012)}"
    a = random.randint(1990, 2010)
    b = random.randint(a, 2012)
    return f"{alias}.{column} > {a} AND  {alias}.{column} < {b}"


def build_compare_predicate(alias, column, candidates):
    """Build a ``<`` or ``>`` comparison of ``alias.column`` with one random candidate."""
    threshold = random.sample(candidates, k=1)[0]
    operator = "<" if random.randint(1, 2) == 1 else ">"
    return f"{alias}.{column} {operator} {quote_sql_literal(threshold)}"


def build_predicate(alias, spec):
    """Dispatch a predicate spec ``(column, kind, ...)`` to its builder.

    Raises:
        ValueError: if ``kind`` is not one of like / in / between / compare.
    """
    column, kind = spec[0], spec[1]
    if kind == "like":
        return build_like_predicate(alias, column, spec[2], spec[3])
    if kind == "in":
        return build_in_predicate(alias, column, spec[2], spec[3])
    if kind == "between":
        return build_between_predicate(alias, column)
    if kind == "compare":
        return build_compare_predicate(alias, column, spec[2])
    raise ValueError(f"Unsupported predicate kind: {kind!r}")


def extract_unary_tables(sql, predicate_specs):
    """Return the ``(table, alias)`` pairs of *sql* that have predicate specs.

    Raises:
        ValueError: if no ``FROM ... WHERE`` section can be found. Previously
            this case hit an unbound (or stale) ``tables_str``.
    """
    matches = pattern.findall(sql.replace("\n", " "))
    if not matches:
        raise ValueError("Query template has no 'FROM ... WHERE' section")
    tables_str = matches[0].strip()
    return [
        (table, alias)
        for table, alias in TABLE_ALIAS_PATTERN.findall(tables_str)
        if table in predicate_specs
    ]


def generate_query(sql, unary_tables, predicate_specs):
    """Append random predicates for a random subset of *unary_tables* to *sql*.

    *sql* is expected to end inside its WHERE clause, because the new
    conditions are appended with ``AND``.
    """
    select_nr_table = random.randint(1, len(unary_tables))
    chosen_tables = random.sample(unary_tables, k=select_nr_table)
    clauses = []
    for table, alias in chosen_tables:
        specs = predicate_specs[table]
        nr_specs = random.randint(1, len(specs))
        for spec in random.sample(specs, k=nr_specs):
            clauses.append(build_predicate(alias, spec))
    unary_table_predicate_sql = "\n  AND ".join(clauses)
    return f"{sql} \n  AND {unary_table_predicate_sql}"


def main():
    """Generate 100 random variants for each of the query templates 1..33.

    Every variant is executed against MonetDB. It is written to
    ``result2/q<i>_<repeat>.sql`` only when the first column of the first
    result row is not NULL; otherwise a new variant is drawn and tried again
    (there is no retry limit).
    """
    os.makedirs("result2", exist_ok=True)
    connection = pymonetdb.connect(username="monetdb", password="monetdb", hostname="localhost", database="imd_noidx")
    try:
        cursor = connection.cursor()
        for i in tqdm(range(1, 34)):
            for repeat_time in range(100):
                # NOTE(review): the skip check looks at result/*.json while
                # the output below goes to result2/*.sql -- confirm this
                # mismatch is intentional.
                if os.path.exists(f"result/q{i}_{repeat_time}.json"):
                    continue
                # Read the template once per variant and close the file
                # before talking to the database.
                with open(f"query_template/{i}.sql", "r") as f:
                    sql = f.read().strip()
                unary_tables = extract_unary_tables(sql, predicates)
                if not unary_tables:
                    raise ValueError(
                        f"query_template/{i}.sql references no table with configured predicates"
                    )
                while True:
                    total_sql = generate_query(sql, unary_tables, predicates)
                    cursor.execute(total_sql)
                    row = cursor.fetchone()
                    # A missing row is treated like a NULL result: retry.
                    cardinality = row[0] if row else None
                    print(cardinality)
                    if cardinality is not None:
                        with open(f"result2/q{i}_{repeat_time}.sql", "w") as fw:
                            fw.write(total_sql)
                        break
    finally:
        connection.close()


if __name__ == "__main__":
    main()


