import re

# Java keywords: the reserved words in alphabetical order, followed by the
# literal words true / false / null, which are labelled as keywords too.
java_keywords = (
    'abstract assert boolean break byte case catch char '
    'class const continue default do double else enum '
    'extends final finally float for goto if implements '
    'import instanceof int interface long native new '
    'package private protected public return short static '
    'strictfp super switch synchronized this throw throws '
    'transient try void volatile while '
    'true false null'
).split()

# Operators and symbols
java_operators = [
    '+', '-', '*', '/', '%', '++', '--',
    '==', '!=', '>', '<', '>=', '<=',
    '&&', '||', '!', '=', '+=', '-=', '*=', '/=', '%=',
    '<<', '>>', '>>>', '<<=', '>>=', '>>>=',
    '&', '|', '^', '~', '&=', '|=', '^=',
    '?', ':', '->', '::'
]

java_symbols = ['(', ')', '{', '}', '[', ']', ';', ',', '.', '@']

# Lookup helpers used to split a run of operator characters by longest match.
_JAVA_OPERATOR_SET = frozenset(java_operators)
_JAVA_MAX_OPERATOR_LEN = max(map(len, java_operators))

# Character classification (adapted to the state machine). The index of each
# group is the column index used in lexical_state_table_java.
character_dict_java = [
    ['+', '-', '*', '/', '%', '=', '!', '<', '>', '&', '|', '^', '~', '?', ':'],  # 0 operators
    ['(', ')', '{', '}', '[', ']', ';', ',', '.', '@'],                         # 1 symbols
    ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],                          # 2 digits
    [chr(i) for i in range(65, 91)] + [chr(i) for i in range(97, 123)] + ['_', '$'],  # 3 letters and valid identifiers
    ['\"', '\''],                                                               # 4 string or char
    [' ', '\t'],                                                                # 5 whitespace
    ['\n'],                                                                     # 6 newline
]

# Flat character -> class-index map so classification is one dict lookup.
# Groups are visited in reverse so that, should a character ever appear in
# two groups, the lowest index wins (same result as scanning the groups in
# order).
_JAVA_CHAR_CLASS = {
    char: index
    for index, group in reversed(list(enumerate(character_dict_java)))
    for char in group
}

# Rows of lexical_state_table_java that the driver treats specially.
_JAVA_STATE_START = 0
_JAVA_STATE_OPERATOR = 1
_JAVA_STATE_NUMBER = 3
_JAVA_STATE_STRING = 5

# Columns of lexical_state_table_java that the driver treats specially.
_JAVA_CLASS_DIGIT = 2
_JAVA_CLASS_QUOTE = 4

def T_java(c):
    """Return the character-class index of ``c`` (a column of
    ``lexical_state_table_java``), or -1 when ``c`` belongs to no class."""
    return _JAVA_CHAR_CLASS.get(c, -1)

# Transition table: lexical_state_table_java[state][char_class] -> next state.
# A next state of 0 means "the current token (if any) ends before this
# character".
lexical_state_table_java = [
    [1, 2, 3, 4, 5, 0, 0],  # 0: start
    [1, 0, 0, 0, 0, 0, 0],  # 1: operator (a run, split by longest match later)
    [0, 0, 0, 0, 0, 0, 0],  # 2: symbol (always a single character)
    [0, 0, 3, 3, 0, 0, 0],  # 3: number (letters allow 0x1F, 10L, 1_000)
    [0, 0, 4, 4, 0, 0, 0],  # 4: identifier
    [5, 5, 5, 5, 6, 5, 5],  # 5: string/char
    [0, 0, 0, 0, 0, 0, 0],  # 6: end string
]

def _split_java_operator_run(run):
    """Split a run of operator characters into operators, longest match first.

    For example ``'<?>'`` becomes ``['<', '?', '>']`` and ``'&&!'`` becomes
    ``['&&', '!']``. A single character is always accepted so the scan
    cannot stall.
    """
    parts = []
    pos = 0
    while pos < len(run):
        size = min(_JAVA_MAX_OPERATOR_LEN, len(run) - pos)
        while size > 1 and run[pos:pos + size] not in _JAVA_OPERATOR_SET:
            size -= 1
        parts.append(run[pos:pos + size])
        pos += size
    return parts

def _emit_java_token(tokens, token, state):
    """Append a finished ``token`` to ``tokens``; operator runs are split."""
    if state == _JAVA_STATE_OPERATOR:
        tokens.extend(_split_java_operator_run(token))
    else:
        tokens.append(token)

def tokenize_java(line):
    """Split one line of Java source into a list of token strings.

    The scan is driven by ``lexical_state_table_java``. When the table says
    the current token has ended, the token is emitted and the same character
    is dispatched again from the start state, so the character that ends one
    token can begin the next one (``x+=1`` -> ``x``, ``+=``, ``1``).

    On top of the table the driver handles:
    * operator runs, which are split by longest match against
      ``java_operators``;
    * a ``.`` between digits, which stays inside the number (``3.14``);
    * string/char literals, which end only at the quote that opened them,
      honour backslash escapes and may contain unclassified characters.

    Whitespace is dropped. Any other unclassified character outside a
    literal is returned as a one-character token.

    Args:
        line: The source text to tokenize.

    Returns:
        The tokens in source order.
    """
    tokens = []
    token = ''
    state = _JAVA_STATE_START
    i = 0
    length = len(line)

    while i < length:
        c = line[i]
        ctype = T_java(c)

        if state == _JAVA_STATE_STRING:
            if c == '\\' and i + 1 < length:
                # Escape sequence: keep the backslash and the escaped char
                # so an escaped quote does not close the literal.
                token += line[i:i + 2]
                i += 2
                continue
            if ctype == -1 or (ctype == _JAVA_CLASS_QUOTE and c != token[0]):
                # Unclassified characters and the "other" quote character
                # are ordinary literal content.
                token += c
                i += 1
                continue
        elif (state == _JAVA_STATE_NUMBER and c == '.' and '.' not in token
              and i + 1 < length
              and T_java(line[i + 1]) == _JAVA_CLASS_DIGIT):
            # Decimal point followed by a digit belongs to the number.
            token += c
            i += 1
            continue

        if ctype == -1:
            new_state = _JAVA_STATE_START
        else:
            new_state = lexical_state_table_java[state][ctype]

        if new_state != _JAVA_STATE_START:
            token += c
            state = new_state
            i += 1
        elif state != _JAVA_STATE_START:
            # Token ended before c: emit it and re-dispatch c (no advance).
            _emit_java_token(tokens, token, state)
            token = ''
            state = _JAVA_STATE_START
        else:
            # Start state and nothing to begin: whitespace is skipped,
            # any other unclassified character becomes its own token.
            if ctype == -1 and not c.isspace():
                tokens.append(c)
            i += 1

    if token:
        _emit_java_token(tokens, token, state)
    return tokens

# One left-to-right pass over comments and literals. Whichever construct
# starts first wins, so "//" or "/*" inside a string literal is not taken for
# a comment and a quote inside a comment does not open a literal.
_JAVA_COMMENT_OR_LITERAL_RE = re.compile(
    r'(?P<comment>//[^\n]*|/\*[\s\S]*?\*/)'
    r'|(?P<string>"(?:\\.|[^"\\\n])*")'
    r"|(?P<char>'(?:\\.|[^'\\\n])*')"
)

# Replacement text for each named group of _JAVA_COMMENT_OR_LITERAL_RE.
_JAVA_PLACEHOLDER_BY_GROUP = {'comment': '', 'string': '""', 'char': "''"}

def remove_comment_and_string_java(line):
    """Strip comments and blank out literal contents in Java source text.

    * ``// ...`` comments (to end of line) and ``/* ... */`` comments are
      removed.
    * Double-quoted literals are replaced by ``""`` and single-quoted
      literals by ``''``; backslash escapes such as ``\\"`` are understood.

    Comments and literals are recognised in a single pass so that one cannot
    be mistaken for the other (``"http://x"`` stays a string).

    Args:
        line: Java source text, typically a single line.

    Returns:
        The text with comments removed and literals emptied.
    """
    def placeholder(match):
        return _JAVA_PLACEHOLDER_BY_GROUP[match.lastgroup]

    return _JAVA_COMMENT_OR_LITERAL_RE.sub(placeholder, line)

# Java numeric literals: hex, binary, and decimal/octal integers with an
# optional long suffix, plus decimal floating point with optional fraction,
# exponent and f/d suffix. Underscores are allowed between digits.
_JAVA_NUMBER_RE = re.compile(
    r'0[xX][0-9a-fA-F](?:[0-9a-fA-F_]*[0-9a-fA-F])?[lL]?'
    r'|0[bB][01](?:[01_]*[01])?[lL]?'
    r'|[0-9](?:[0-9_]*[0-9])?'
    r'(?:[lL]|(?:\.[0-9](?:[0-9_]*[0-9])?)?(?:[eE][+-]?[0-9]+)?[fFdD]?)'
)
_JAVA_IDENTIFIER_RE = re.compile(r'[a-zA-Z_$][a-zA-Z0-9_$]*')

def get_token_label_java(token):
    """Classify a single Java token.

    Checks are made in this order: keyword, operator, symbol, number,
    identifier, string literal.

    Args:
        token: The token text.

    Returns:
        One of ``'keyword'``, ``'operator'``, ``'symbol'``, ``'number'``,
        ``'identifier'``, ``'string_literal'`` or ``'unknown'``.
    """
    if token in java_keywords:
        return 'keyword'
    if token in java_operators:
        return 'operator'
    if token in java_symbols:
        return 'symbol'
    if _JAVA_NUMBER_RE.fullmatch(token):
        return 'number'
    if _JAVA_IDENTIFIER_RE.fullmatch(token):
        return 'identifier'
    # Any token wrapped in a matching pair of quotes is a literal; this
    # includes the '""' and "''" placeholders left after literal stripping.
    if len(token) >= 2 and token[0] in ('"', "'") and token[-1] == token[0]:
        return 'string_literal'
    return 'unknown'

def get_label_of_tokens_java(tokens):
    """Return the label of every token in ``tokens``, in the same order."""
    labels = []
    for item in tokens:
        labels.append(get_token_label_java(item))
    return labels

# Scans one line for a block comment that is opened but not closed on it.
# Literals, line comments and block comments closed on the same line are
# matched first so that a "/*" inside them is skipped.
_JAVA_OPEN_COMMENT_SCAN_RE = re.compile(
    r'"(?:\\.|[^"\\])*"'
    r"|'(?:\\.|[^'\\])*'"
    r'|//.*'
    r'|/\*.*?\*/'
    r'|(?P<open>/\*.*)'
)

def _cut_multiline_comment_java(line, in_comment):
    """Drop the parts of ``line`` that lie inside a multi-line block comment.

    Args:
        line: One source line without its line terminator.
        in_comment: True when a ``/*`` from an earlier line is still open.

    Returns:
        A ``(visible_text, still_in_comment)`` tuple.
    """
    if in_comment:
        end = line.find('*/')
        if end == -1:
            return '', True
        line = line[end + 2:]
    for match in _JAVA_OPEN_COMMENT_SCAN_RE.finditer(line):
        if match.group('open') is not None:
            # Everything from this "/*" onward belongs to the comment.
            return line[:match.start()], True
    return line, False

def tokenize_java_code(code):
    """Tokenize Java source text line by line.

    Block comments that span several lines (for example Javadoc) are skipped
    by carrying the "inside a comment" state from one line to the next. Each
    line is then cleaned with ``remove_comment_and_string_java`` and split
    with ``tokenize_java``.

    Args:
        code: Java source text, possibly containing several lines.

    Returns:
        A list with one token list per line of ``code.splitlines()``; lines
        that hold only comments or whitespace yield an empty list.
    """
    all_tokens = []
    in_comment = False
    for line in code.splitlines():
        visible, in_comment = _cut_multiline_comment_java(line, in_comment)
        clean_line = remove_comment_and_string_java(visible)
        all_tokens.append(tokenize_java(clean_line))
    return all_tokens
