
""" Functions for python file tokenization """

import ast
import tokenize
import chardet
import keyword
import builtins

def tokenize_file(source_file, line_numbers=[], include_names=True):
    """
    Returns string, number and name tokens for a given Python file
    Only includes tokens that start or end on the specified line numbers
    (or all tokens if line_numbers is empty)

    The 'name' tokens are valid identifiers for variables, function names etc.
    Python keywords and built-in functions are excluded
    (this could result in different outcomes for different versions of python)
    """

    strings = set(); numbers = set(); names = set()

    with open(source_file, "r", encoding='utf-8') as f:
        token_generator = tokenize.generate_tokens(f.readline)
        for token in token_generator:
            if (token.start[0] in line_numbers or token.end[0] in line_numbers
            or len(line_numbers) == 0):  # specified line numbers only
                match token.type:
                    case tokenize.STRING:
                        try:
                            # extract strings from string tokens using literal eval
                            # may fail, e.g. for f-strings
                            tokstr = ast.literal_eval(token.string)
                            if isinstance(tokstr, str):
                                if len(tokstr) > 0:
                                    strings.add(tokstr)
                            elif isinstance(tokstr, bytes): # handle b-strings
                                if len(tokstr) > 0:
                                    strings.add(tokstr.decode(chardet.detect(tokstr)['encoding']))  # uses best guess for encoding
                            else:
                                raise(Exception("Output of literal_eval is not a string (type "+str(type(tokstr))+")"))
                        except Exception as e:
                            # skip token if string literal can't be evaluated
                            print("Skipping token: "+token.string)
                            print(str(e))
                            continue
                    case tokenize.NUMBER:
                        numbers.add(token.string)
                    case tokenize.NAME:
                        if include_names:  # process identifiers
                            if not (keyword.iskeyword(token.string)         # exclude python keywords
                                    or keyword.issoftkeyword(token.string)  # exclude soft keywords
                                    or token.string in dir(builtins)):      # exclude built-in functions
                                names.add(token.string)

    return strings, numbers, names  # names will be empty if include_names is False
