# -*- coding: utf-8 -*-


!pip install transformers datasets tokenizers peft accelerate bitsandbytes -q



import numpy as np

companies = ['MSFT', 'NVDA', 'AAPL', 'AMZN', 'META', 'AVGO', 'TSLA', 'GOOGL', 'BRK.B', 'GOOG', 'JPM', 'V', 'LLY', 'NFLX', 'XOM', 'MA', 'COST', 'WMT', 'PG', 'HD', 'JNJ', 'ABBV', 'BAC', 'UNH', 'CRM', 'KO', 'PLTR', 'ORCL', 'PM', 'WFC', 'CSCO', 'GE', 'IBM', 'CVX', 'ABT', 'MCD', 'LIN', 'NOW', 'DIS', 'ISRG', 'ACN', 'GS', 'AMD', 'T', 'UBER', 'MRK', 'INTU', 'VZ', 'PEP', 'RTX', 'ADBE', 'BKNG', 'TXN', 'QCOM', 'CAT', 'AXP', 'PGR', 'MS', 'SPGI', 'TMO', 'BA', 'BSX', 'SCHW', 'NEE', 'TJX', 'AMAT', 'C', 'HON', 'AMGN', 'BLK', 'UNP', 'SYK', 'CMCSA', 'ETN', 'LOW', 'PANW', 'DE', 'ADP', 'PFE', 'GILD', 'DHR', 'GEV', 'COP', 'TMUS', 'ADI', 'MMC', 'LRCX', 'BX', 'VRTX', 'MDT', 'FI', 'CRWD', 'KLAC', 'MU', 'CB', 'APH', 'ANET', 'PLD', 'ICE', 'SBUX', 'CME', 'AMT', 'MO', 'TT', 'LMT', 'INTC', 'SO', 'CEG', 'BMY', 'CDNS', 'WELL', 'DUK', 'KKR', 'ELV', 'PH', 'MCK', 'AJG', 'EQIX', 'CI', 'MDLZ', 'SHW', 'WM', 'MMM', 'SNPS', 'TDG', 'AON', 'ORLY', 'CVS', 'COF', 'MCO', 'CTAS', 'UPS', 'NKE', 'PYPL', 'CL', 'WMB', 'CMG', 'PNC', 'MSI', 'ZTS', 'USB', 'GD', 'EMR', 'DASH', 'HCA', 'FTNT', 'ITW', 'EOG', 'HWM', 'APO', 'JCI', 'ADSK', 'BK', 'ECL', 'MAR', 'RCL', 'NOC', 'AZO', 'HLT', 'ROP', 'APD', 'REGN', 'CSX', 'TRV', 'ABNB', 'CARR', 'WDAY', 'FCX', 'NEM', 'CPRT', 'NSC', 'TFC', 'OKE', 'NXPI', 'ALL', 'KMI', 'AXON', 'VST', 'AEP', 'DLR', 'FICO', 'MPC', 'PSX', 'AFL', 'FDX', 'PWR', 'SLB', 'DFS', 'AMP', 'GM', 'ROST', 'PCAR', 'SPG', 'BDX', 'PAYX', 'AIG', 'RSG', 'COR', 'TEL', 'O', 'GWW', 'SRE', 'PSA', 'URI', 'CTVA', 'MET', 'FAST', 'CMI', 'D', 'EW', 'KVUE', 'KDP', 'KMB', 'MSCI', 'KR', 'TGT', 'MNST', 'CCI', 'VRSK', 'VLO', 'EXC', 'IDXX', 'AME', 'F', 'LHX', 'FIS', 'YUM', 'CHTR', 'CTSH', 'XEL', 'PEG', 'CBRE', 'OTIS', 'PRU', 'TTWO', 'BKR', 'HES', 'PCG', 'TRGP', 'RMD', 'HIG', 'GLW', 'CAH', 'LULU', 'VMC', 'MPWR', 'EA', 'WAB', 'SYY', 'ROK', 'DELL', 'DHI', 'ETR', 'ED', 'IT', 'ACGL', 'DXCM', 'EFX', 'EQT', 'NDAQ', 'IR', 'GEHC', 'EBAY', 'MLM', 'VICI', 'MCHP', 'DAL', 'WEC', 'ODFL', 'CSGP', 'A', 'NRG', 'EXR', 'GRMN', 'MTB', 'XYL', 'ANSS', 
'WTW', 'OXY', 'CNC', 'GIS', 'STZ', 'AVB', 'IRM', 'DD', 'KEYS', 'STT', 'VTR', 'RJF', 'BR', 'HUM', 'NUE', 'DTE', 'TSCO', 'FANG', 'HPQ', 'TPL', 'IP', 'GDDY', 'FITB', 'AWK', 'UAL', 'PPG', 'BRO', 'AEE', 'DOV', 'LEN', 'CDW', 'FTV', 'PPL', 'VLTO', 'CPAY', 'DRI', 'ATO', 'TYL', 'HSY', 'SBAC', 'CCL', 'SYF', 'IQV', 'EXE', 'CNP', 'KHC', 'ADM', 'EQR', 'HPE', 'HBAN', 'MTD', 'SW', 'TDY', 'CINF', 'CHD', 'SMCI', 'PODD', 'VRSN', 'STE', 'LYV', 'DVN', 'CBOE', 'ES', 'STX', 'K', 'EIX', 'TROW', 'NVR', 'WRB', 'DOW', 'WSM', 'FE', 'AMCR', 'NTRS', 'EXPE', 'HUBB', 'FSLR', 'PHM', 'PTC', 'GPN', 'WBD', 'CMS', 'WAT', 'RF', 'LH', 'NTAP', 'LDOS', 'DECK', 'DG', 'DGX', 'IFF', 'INVH', 'ULTA', 'ON', 'ZBH', 'LII', 'STLD', 'WY', 'LUV', 'MKC', 'MAA', 'HAL', 'JBL', 'CTRA', 'CFG', 'ESS', 'NI', 'BIIB', 'FDS', 'DLTR', 'TRMB', 'MOH', 'GPC', 'TPR', 'PKG', 'SNA', 'PFG', 'WDC', 'DPZ', 'KEY', 'CLX', 'FFIV', 'PNR', 'EXPD', 'COO', 'APTV', 'BALL', 'LNT', 'GEN', 'TSN', 'BAX', 'ROL', 'J', 'L', 'ZBRA', 'LYB', 'EL', 'WST', 'CF', 'OMC', 'EVRG', 'EG', 'LVS', 'AVY', 'BBY', 'IEX', 'KIM', 'MAS', 'BLDR', 'TER', 'TXT', 'ALGN', 'JKHY', 'HOLX', 'UDR', 'CPT', 'ALLE', 'PAYC', 'JNPR', 'FOXA', 'DOC', 'REG', 'JBHT', 'SJM', 'POOL', 'AKAM', 'SWKS', 'CHRW', 'SWK', 'RVTY', 'UHS', 'BG', 'ARE', 'NDSN', 'LKQ', 'HST', 'RL', 'TKO', 'NWSA', 'CAG', 'MOS', 'KMX', 'EPAM', 'VTRS', 'AIZ', 'PNW', 'GL', 'SOLV', 'INCY', 'BXP', 'TAP', 'EMN', 'DAY', 'IPG', 'ERIE', 'AES', 'HII', 'HSIC', 'WYNN', 'NCLH', 'HAS', 'HRL', 'MRNA', 'AOS', 'WBA', 'MKTX', 'MGM', 'GNRC', 'TECH', 'MTCH', 'LW', 'FRT', 'ALB', 'CRL', 'PARA', 'IVZ', 'BEN', 'CPB', 'APA', 'FOX', 'CZR', 'ENPH', 'BF.B']
# Field labels that appear inside the generated training lines; they are later
# appended to the tokenizer's special-token list.
instructions = [
    'ESG:',
    'Company:',
    'ESG First Order:',
    'ESG Second Order:',
    'ESG Moving Average:',
    'Returns:',
    'Returns First Order:',
    'Returns Second Order:',
    'Sentiment:',
]

# Per-company token sequences, keyed by ticker and filled by the loops below.
moving_avg_token_map = {}
first_order_token_map = {}
second_order_token_map = {}
esg_token_map = {}
e_token_map = {}
s_token_map = {}
g_token_map = {}

# Every ticker encountered while building lines (duplicates are kept).
company_for_append = []
# Final whitespace-joined training lines.
data_set_tokenized_lines = []

import ast

# Read the raw contents of the E-score file (presumably the environmental
# pillar, given it is tokenised with the "ENV" label further down).
with open('PaperReady_e_scores.txt', 'r') as file:
    raw_text = file.read()

# The file is expected to contain a Python literal; literal_eval parses it
# without executing arbitrary code.
e_data_full_list = ast.literal_eval(raw_text)

# `e_data_full_list` should now be a list of dicts, each with a 'text' entry
print(type(e_data_full_list))         # should be <class 'list'>
print(e_data_full_list[0]['text'])    # shows first record

import ast

# Read the raw contents of the S-score file (presumably the social pillar,
# given it is tokenised with the "SOC" label further down).
with open('PaperReady_s_scores.txt', 'r') as file:
    raw_text = file.read()

# The file is expected to contain a Python literal; literal_eval parses it
# without executing arbitrary code.
s_data_full_list = ast.literal_eval(raw_text)

# `s_data_full_list` should now be a list of dicts, each with a 'text' entry
print(type(s_data_full_list))         # should be <class 'list'>
print(s_data_full_list[0]['text'])    # shows first record

import ast

# Read the raw contents of the G-score file (presumably the governance pillar,
# given it is tokenised with the "GOV" label further down).
with open('PaperReady_g_scores.txt', 'r') as file:
    raw_text = file.read()

# The file is expected to contain a Python literal; literal_eval parses it
# without executing arbitrary code.
g_data_full_list = ast.literal_eval(raw_text)

# `g_data_full_list` should now be a list of dicts, each with a 'text' entry
print(type(g_data_full_list))         # should be <class 'list'>
print(g_data_full_list[0]['text'])    # shows first record

import ast

# Read the raw contents of the combined ESG risk-rating file.
with open('esg_risk_ratings_1.txt', 'r') as file:
    raw_text = file.read()

# The file is expected to contain a Python literal; literal_eval parses it
# without executing arbitrary code.
esg_data_full_list = ast.literal_eval(raw_text)

# `esg_data_full_list` should now be a list of dicts, each with a 'text' entry
print(type(esg_data_full_list))         # should be <class 'list'>
print(esg_data_full_list[0]['text'])    # shows first record

import re
def create_train_test_split(data_list, stype, test_size=4):
    """Split every record's numeric series into a training prefix and a held-out tail.

    Args:
        data_list: Iterable of dicts whose ``'text'`` entry contains
            ``Company: <TICKER>`` followed by decimal values such as ``80.00``.
        stype: Series label written into the rebuilt text (e.g. ``"ESG"``).
        test_size: Number of trailing values to hold out per record.

    Returns:
        A pair ``(data_new_list, actual_dict)``. ``data_new_list`` holds
        ``{'text': "Company: <TICKER> <stype>: v1 v2 ..."}`` records with the
        held-out values removed; ``actual_dict`` maps each ticker to its
        held-out values as floats.

    Raises:
        ValueError: If ``test_size`` is negative or a record has no
            ``Company:`` ticker.
    """
    if test_size < 0:
        raise ValueError("test_size must be non-negative")

    data_new_list = []
    actual_dict = {}

    for data in data_list:
        text = data['text']

        # Only values written with a decimal point are picked up.
        values = re.findall(r'\d+\.\d+', text)

        # An explicit split index keeps test_size=0 correct; the slice
        # values[:-0] would otherwise be empty.
        split_at = max(len(values) - test_size, 0)
        data_new_values = values[:split_at]
        last_values = [float(v) for v in values[split_at:]]

        # Allow dotted share-class tickers such as BRK.B or BF.B, which a
        # plain [A-Z]+ pattern would truncate at the dot.
        company_match = re.search(r'Company:\s*([A-Z]+(?:\.[A-Z]+)*)', text)
        if company_match is None:
            raise ValueError("Company name not found in data['text']")
        company_name = company_match.group(1)

        trimmed_text = f"Company: {company_name} {stype}: " + " ".join(data_new_values)
        data_new_list.append({'text': trimmed_text})
        actual_dict[company_name] = last_values

    return data_new_list, actual_dict

# Hold out the last 20 values of every series; the *_new_list variables keep
# the training prefix and the actual_dict* variables keep the held-out tail.
data_new_list, actual_dict = create_train_test_split(
    esg_data_full_list, "ESG", test_size=20
)

data_new_list_e, actual_dict_e = create_train_test_split(
    e_data_full_list, "ENV", test_size=20
)

data_new_list_s, actual_dict_s = create_train_test_split(
    s_data_full_list, "SOC", test_size=20
)

data_new_list_g, actual_dict_g = create_train_test_split(
    g_data_full_list, "GOV", test_size=20
)



import numpy as np

def longest_consistency_streak(values):
    """Return the length of the longest run of equal consecutive values.

    Args:
        values: A sized, sliceable sequence (list or NumPy array).

    Returns:
        ``0`` for an empty sequence, otherwise the longest run length (>= 1).
    """
    # An empty series contains no run at all.
    if len(values) == 0:
        return 0

    max_streak = 1
    current_streak = 1
    for previous, current in zip(values, values[1:]):
        if current == previous:
            current_streak += 1
            max_streak = max(max_streak, current_streak)
        else:
            current_streak = 1
    return max_streak

# Kernel extraction functions
def compute_kernels(esg_values, epsilon=0.4, verbose=True):
    """Compute first- and second-order differences of a series.

    Args:
        esg_values: Sequence of numeric values.
        epsilon: Unused. It belonged to a noise-injection step that is
            disabled; kept so existing calls keep working.
        verbose: When true (the default) print the input and both difference
            arrays, as this function has always done.

    Returns:
        ``(first_order, second_order)`` NumPy arrays, each rounded to two
        decimals. ``second_order`` is the difference of the already rounded
        ``first_order``.
    """
    esg_values = np.array(esg_values)

    first_order = np.round(np.diff(esg_values), 2)
    second_order = np.round(np.diff(first_order), 2)

    if verbose:
        print("esg")
        print(esg_values)
        print("first_order")
        print(first_order)
        print("second_order")
        print(second_order)

    return first_order, second_order


def compute_moving_average_tokens(values, window, kind):
    """Return one ``<MA_..>`` token per full sliding window over ``values``.

    Each token encodes the arithmetic mean of ``window`` consecutive values.
    ``kind`` is accepted but not used: the token prefix is always ``"MA"``.
    """
    window_count = len(values) - window + 1
    return [
        get_SSNT_format(sum(values[begin:begin + window]) / window, "MA")
        for begin in range(window_count)
    ]

#def moving_average_to_token(ma_value, window,kind):
 #   return f"<{kind}_{format_diff_token(ma_value)}>"

def format_diff_token(value, kind="FO"):
    """Format a number as a signed, zero-padded ``II.DD`` string.

    Args:
        value: Number to format.
        kind: Unused; kept for call compatibility.

    Returns:
        For example ``"80.00"``, ``"-02.50"`` or ``"00.29"``.
    """
    sign = "-" if value < 0 else ""
    # Round the whole magnitude to hundredths before splitting it, so that a
    # fraction rounding up to 100 carries into the integer part
    # (1.999 -> "02.00" instead of "01.100").
    hundredths = int(round(abs(value) * 100))
    integer_part, decimal_part = divmod(hundredths, 100)
    return f"{sign}{integer_part:02}.{decimal_part:02}"


def format_diff_token_int(value, kind="FO"):
    """Format a number as a signed, zero-padded integer string.

    The fractional part is truncated, not rounded. ``kind`` is unused.
    """
    magnitude = int(abs(value))
    prefix = "-" if value < 0 else ""
    return f"{prefix}{magnitude:02}"

def get_SSNT_format(numeric_value, kind):
    """Wrap the two-decimal rendering of ``numeric_value`` as ``<kind_II.DD>``."""
    rendered = format_diff_token(numeric_value)
    return f"<{kind}_{rendered}>"

def get_SSNT_format_RETURN(numeric_value, kind):
    """Wrap the truncated-integer rendering of ``numeric_value`` as ``<kind_II>``."""
    rendered = format_diff_token_int(numeric_value)
    return f"<{kind}_{rendered}>"

# Ticker -> space-joined value tokens (no "Company:" header) for each series.
esg_core = {}
esg_fo = {}
esg_so = {}
e_core = {}
s_core = {}
g_core = {}
senti_core = {}
ret_core = {}

# Notebook cell display of the ENV training records.
data_new_list_e

# Turn every ENV training record into a header plus <ENV_II.DD> value tokens.
for record in data_new_list_e:
    text = record['text'].replace('<EOS>', '')

    # The ticker sits between the 'Company: ' label and the ' ENV:' label.
    company = text.split('Company: ')[1].split(' ENV:')[0].strip()
    company_for_append.append(company)

    # Everything after 'ENV:' is the whitespace-separated numeric series.
    e_values = text.split('ENV:')[1].replace('<EOS>', '').strip().split()
    e_values = list(map(float, e_values))

    e_tokens = [f"Company: {company} ENV:"]
    e_tokens.extend(get_SSNT_format(v, kind="ENV") for v in e_values)

    # The core string is the value tokens only, without the header.
    e_core[company] = " ".join(e_tokens[1:])
    e_token_map[company] = e_tokens

# Turn every SOC training record into a header plus <SOC_II.DD> value tokens.
for record in data_new_list_s:
    text = record['text'].replace('<EOS>', '')

    # The ticker sits between the 'Company: ' label and the ' SOC:' label.
    company = text.split('Company: ')[1].split(' SOC:')[0].strip()
    company_for_append.append(company)

    # Everything after 'SOC:' is the whitespace-separated numeric series.
    s_values = text.split('SOC:')[1].replace('<EOS>', '').strip().split()
    s_values = list(map(float, s_values))

    s_tokens = [f"Company: {company} SOC:"]
    s_tokens.extend(get_SSNT_format(v, kind="SOC") for v in s_values)

    # The core string is the value tokens only, without the header.
    s_core[company] = " ".join(s_tokens[1:])
    s_token_map[company] = s_tokens

# Turn every GOV training record into a header plus <GOV_II.DD> value tokens.
for record in data_new_list_g:
    text = record['text'].replace('<EOS>', '')

    # The ticker sits between the 'Company: ' label and the ' GOV:' label.
    company = text.split('Company: ')[1].split(' GOV:')[0].strip()
    company_for_append.append(company)

    # Everything after 'GOV:' is the whitespace-separated numeric series.
    g_values = text.split('GOV:')[1].replace('<EOS>', '').strip().split()
    g_values = list(map(float, g_values))

    g_tokens = [f"Company: {company} GOV:"]
    g_tokens.extend(get_SSNT_format(v, kind="GOV") for v in g_values)

    # The core string is the value tokens only, without the header.
    g_core[company] = " ".join(g_tokens[1:])
    g_token_map[company] = g_tokens

# NOTE(review): this repeats the ENV pass that already ran earlier in the file.
# It rewrites identical e_core / e_token_map entries and appends every ENV
# ticker to company_for_append a second time - confirm whether it is needed.
for record in data_new_list_e:
    text = record['text'].replace('<EOS>', '')

    # The ticker sits between the 'Company: ' label and the ' ENV:' label.
    company = text.split('Company: ')[1].split(' ENV:')[0].strip()
    company_for_append.append(company)

    # Everything after 'ENV:' is the whitespace-separated numeric series.
    raw_series = text.split('ENV:')[1].replace('<EOS>', '').strip()
    e_values = [float(item) for item in raw_series.split()]

    e_tokens = [f"Company: {company} ENV:"]
    for v in e_values:
        e_tokens.append(get_SSNT_format(v, kind="ENV"))

    e_core[company] = " ".join(e_tokens[1:])
    e_token_map[company] = e_tokens

# Build the training lines for every company: the ESG level series, that
# company's ENV/SOC/GOV series, and the first/second-order ESG differences.
for record in data_new_list:
    text = record['text'].replace('<EOS>', '')

    # The ticker sits between the 'Company: ' label and the ' ESG:' label.
    company = text.split('Company: ')[1].split(' ESG:')[0].strip()
    company_for_append.append(company)

    # Everything after 'ESG:' is the whitespace-separated numeric series.
    esg_values = text.split('ESG:')[1].replace('<EOS>', '').strip().split()
    esg_values = [float(v) for v in esg_values]

    fo, so = compute_kernels(esg_values)
    print(fo)
    print(so)

    esg_tokens = [f"Company: {company} ESG:"]
    esg_tokens += [get_SSNT_format(v, kind="ESG") for v in esg_values]
    esg_core[company] = " ".join(esg_tokens[1:])
    esg_token_map[company] = esg_tokens

    fo_tokens = [f"Company: {company} ESG First Order:"]
    fo_tokens += [get_SSNT_format(v, kind="ESGFO") for v in fo]
    esg_fo[company] = " ".join(fo_tokens[1:])
    first_order_token_map[company] = fo_tokens

    so_tokens = [f"Company: {company} ESG Second Order:"]
    so_tokens += [get_SSNT_format(v, kind="ESGSO") for v in so]
    esg_so[company] = " ".join(so_tokens[1:])
    second_order_token_map[company] = so_tokens

    data_set_tokenized_lines.append(" ".join(esg_tokens))

    # Look the pillar series up by ticker. The module-level e_tokens /
    # s_tokens / g_tokens only hold the LAST company processed by the pillar
    # loops, so reading them here would repeat that one company's lines for
    # every ESG record. Companies without a pillar series are skipped.
    for pillar_map in (e_token_map, s_token_map, g_token_map):
        pillar_tokens = pillar_map.get(company)
        if pillar_tokens is not None:
            data_set_tokenized_lines.append(" ".join(pillar_tokens))

    data_set_tokenized_lines.append(" ".join(fo_tokens))
    data_set_tokenized_lines.append(" ".join(so_tokens))

# Notebook cell display: GOV value tokens for one sample ticker.
g_core['AAPL']

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer


#with open("training_data_set_company.txt", "w") as f:
 #   for line in tokenized_lines:
  #      f.write(line + "\n")
# Vocabulary lists. The lists that are not filled here start empty.
ma_tokens = []
return_1y_tokens = []
return_1y_tokens_fo = []
return_1y_tokens_so = []

e_tokens = []
s_tokens = []
g_tokens = []

# ESG levels: <ESG_00.00> .. <ESG_99.99>
esg_tokens = [
    f"<ESG_{a:02}.{b:02}>"
    for a in range(100)
    for b in range(100)
]

# First-order differences: -99.00 .. 99.99, then the negative-zero range
# -00.00 .. -00.99, which the signed integer loop cannot express.
fo_tokens = [
    f"<ESGFO_{'-' if a < 0 else ''}{abs(a):02}.{b:02}>"
    for a in range(-99, 100)
    for b in range(100)
]
fo_tokens += [f"<ESGFO_-00.{b:02}>" for b in range(100)]

# Second-order differences: same layout as the first-order tokens.
so_tokens = [
    f"<ESGSO_{'-' if a < 0 else ''}{abs(a):02}.{b:02}>"
    for a in range(-99, 100)
    for b in range(100)
]
so_tokens += [f"<ESGSO_-00.{b:02}>" for b in range(100)]

# Integer return tokens for -99 .. 99.
return_1y_tokens.extend(
    get_SSNT_format_RETURN(a, "RET") for a in range(-99, 100)
)
# NOTE(review): this second pass formats 0 .. 99 again, so it re-adds tokens
# already produced above instead of a negative-zero token - confirm the intent.
return_1y_tokens.extend(
    get_SSNT_format_RETURN(b, "RET") for b in range(100)
)

# Pillar levels: 00.00 .. 99.99 for each of ENV, SOC and GOV.
e_tokens.extend(
    f"<ENV_{a:02}.{b:02}>" for a in range(100) for b in range(100)
)
s_tokens.extend(
    f"<SOC_{a:02}.{b:02}>" for a in range(100) for b in range(100)
)
g_tokens.extend(
    f"<GOV_{a:02}.{b:02}>" for a in range(100) for b in range(100)
)




#for a in range(-100, 100):
 #   sign = "-" if a < 0 else ""
  #  return_ytd_tokens.append(f"<RETYTD_{sign}{abs(a):02}>")



#so_tokens = [f"<SO_CURV_{a:02}.{b:02}>" for a in range(-100,100) for b in range(100)]

#so_tokens = [f"<SO_CURV_{i}>" for i in range(-2500, 2501, 100)]

# Every numeric / sentiment token, in the order the series lists are joined.
all_tokens = [
    *fo_tokens,
    *so_tokens,
    *esg_tokens,
    "<SENTI_10>",
    "<SENTI_-10>",
    "<SENTI_00>",
    *return_1y_tokens,
    *e_tokens,
    *s_tokens,
    *g_tokens,
]

# Full special-token list: value tokens, tickers, control tokens, field labels.
special_tokens = all_tokens + companies + ["<PAD>", "<EOS>", "<UNK>"] + instructions

# Alias for the same list object.
all_possible_tokens = special_tokens

import numpy as np
import re

import numpy as np
import re

def generate_blockwise_series_embedding(token: str, dim: int = 768, scale: float = 1.0) -> np.ndarray:
    """Build a fixed embedding for a ``<SERIES_value>`` token.

    The vector is zero everywhere except in the slice reserved for the
    token's series. That slice holds a phase-shifted sine plus a small
    quadratic term, both driven by the numeric value, and is standardised to
    zero mean and (almost) unit standard deviation.

    The design intent is that tokens with closer numeric values end up with
    more similar vectors; the code itself does not enforce this.

    Args:
        token: Text such as ``"<ESG_70.00>"`` or ``"<RET_-05>"``.
        dim: Total embedding length, divided evenly among the eight series.
        scale: Amplitude of the sine term before standardisation.

    Returns:
        A 1-D array of length ``dim``.

    Raises:
        ValueError: If ``token`` does not have the ``<SERIES_value>`` shape
            or its series name is not one of the known series.
    """
    parsed = re.match(r"^<([A-Z]+)_(-?\d+(?:\.\d+)?)>$", token)
    if parsed is None:
        raise ValueError(f"Invalid token: {token}")
    series_prefix = parsed.group(1)
    numeric_value = float(parsed.group(2))

    # Each series owns one contiguous, equally sized slice of the vector.
    series_keys = ["RET", "SOC", "GOV", "ESG", "ESGFO", "ESGSO", "ENV", "SENTI"]
    block_size = dim // len(series_keys)
    start_idx = series_keys.index(series_prefix) * block_size
    stop_idx = start_idx + block_size

    grid = np.linspace(0, 1, block_size)
    shift = numeric_value / 100

    # Sine whose phase moves with the value, plus a value-scaled quadratic.
    block = np.sin((grid + shift) * np.pi) * scale
    block += (grid ** 2) * shift * 0.5

    # Standardise the slice; the epsilon guards against a zero deviation.
    block = (block - block.mean()) / (block.std() + 1e-8)

    pe = np.zeros(dim)
    pe[start_idx:stop_idx] = block
    return pe



from sklearn.decomposition import PCA
from itertools import product
import numpy as np

# === Step 1: Define Token Set ===
# You already have: all_tokens = ["<ESG_70.0>", "<RET_10.0>", ...]
# We use this as-is.

##best_config = optimize_series_base_indices(all_tokens)

import torch
from transformers import GPT2Tokenizer, GPT2Model
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import numpy as np
import re







import ast

# Step 1: Read the raw contents of the weekly sentiment file
with open('weekly_sentiments.txt', 'r') as file:
    content = file.read()

# Step 2: Parse the text as a Python literal (no code is executed).
# The result is indexed by ticker further down, so it is expected to be a dict.
raw_sentiments = ast.literal_eval(content)

# Step 3: Print the parsed structure for inspection
print(raw_sentiments)

import random

# Sentiment labels as they appear in the raw data.
sentiment_choices = [
    '<SENTI_POSITIVE>',
    '<SENTI_NEUTRAL>',
    '<SENTI_NEGATIVE>',
]

# Raw label -> numeric-style token used in the vocabulary.
sentiment_map = dict(zip(
    sentiment_choices,
    ('<SENTI_10>', '<SENTI_00>', '<SENTI_-10>'),
))

# Ticker -> list of mapped sentiment tokens.
sentiment_series = dict()
#raw_sentiments = {}


# Map each company's raw sentiment labels to vocabulary tokens.
for company in companies:
    try:
        transformed_series = [sentiment_map[token] for token in raw_sentiments[company]]
    except (KeyError, TypeError):
        # The ticker is missing from raw_sentiments, a label is not in
        # sentiment_map, or the entry is not iterable. Skip the company but
        # let unrelated errors (and Ctrl-C) surface.
        print("skipped for sentiment",company)
    else:
        sentiment_series[company] = transformed_series

# Take a shallow copy instead of an alias: a later loop writes joined strings
# into senti_core while iterating sentiment_series, which would otherwise
# overwrite the token lists stored in sentiment_series.
senti_core = dict(sentiment_series)

# Placeholder; replaced further down by the result of parse_and_scale_file.
return_series = {}

# Disabled earlier variant that filled return_series with random values:
#for company in companies:
 #   # Generate 12 random returns between -30% and +30% with 1 decimal place
  #  return_series[company] = [round(random.randint(-30, 30), 1) for _ in range(12)] # Original Data

import ast

def parse_and_scale_file(filename):
    """Read ``key: [v1, v2, ...]`` lines and return the values scaled by 100.

    Lines without a colon are ignored. The text after the first colon is
    parsed as a Python literal, and every element is multiplied by 100 and
    rounded to the nearest integer.

    Returns:
        Dict mapping the stripped key to its list of scaled values.
    """
    data_dict = {}
    with open(filename, 'r') as file:
        for raw_line in file:
            if ':' not in raw_line:
                continue
            label, payload = raw_line.strip().split(':', 1)
            parsed = ast.literal_eval(payload.strip())
            data_dict[label.strip()] = [round(item * 100) for item in parsed]
    return data_dict

# Load the monthly returns, scaled by 100 and rounded to integers.
filename = 'monthly_returns.txt'
return_series = parse_and_scale_file(filename)

# One "Company: <ticker> Returns: ..." record per company.
return_strings = []

for company, returns in return_series.items():
    return_text = " ".join(str(item) for item in returns)
    ret_core[company] = return_text
    return_strings.append({'text': f"Company: {company} Returns: {return_text}"})

# Convert to the requested format
# One "Company: <ticker> Sentiment: ..." record per company.
sentiment_strings = []

for company, sentiments in sentiment_series.items():
    # Space-joined sentiment tokens, with no week numbers or tuples.
    sentiment_text = " ".join(sentiments)
    senti_core[company] = sentiment_text
    sentiment_strings.append({'text': f"Company: {company} Sentiment: {sentiment_text}"})

# Print the result
#for item in sentiment_strings:
    #print(item)

# Reset the working lists before building the sentiment and return lines.
fo_r_tokens = []
so_r_tokens = []
esg_tokens = []
return_1y_tokens = []
senti_values = []

# Append one tokenised sentiment line per company.
for record in sentiment_strings:
    text = record['text'].replace('<EOS>', '')

    # The ticker sits between the 'Company: ' label and the ' Sentiment:' label.
    company = text.split('Company: ')[1].split(' Sentiment:')[0].strip()
    company_for_append.append(company)

    # The values are already tokens, so they are used as they are.
    senti_values = text.split('Sentiment:')[1].replace('<EOS>', '').strip().split()
    senti_tokens = [f"Company: {company} Sentiment:", *senti_values]
    data_set_tokenized_lines.append(" ".join(senti_tokens))

# Append one tokenised returns line per company and refresh ret_core with
# the <RET_II> token form of the series.
for record in return_strings:
    text = record['text'].replace('<EOS>', '')

    # The ticker sits between the 'Company: ' label and the ' Returns:' label.
    company = text.split('Company: ')[1].split(' Returns:')[0].strip()
    company_for_append.append(company)

    # Everything after 'Returns:' is the whitespace-separated numeric series.
    r_values = text.split('Returns:')[1].replace('<EOS>', '').strip().split()
    r_values = list(map(float, r_values))

    r_tokens = [f"Company: {company} Returns:"]
    r_tokens.extend(get_SSNT_format_RETURN(v, kind="RET") for v in r_values)

    # The core string is the value tokens only, without the header.
    ret_core[company] = " ".join(r_tokens[1:])

    data_set_tokenized_lines.append(" ".join(r_tokens))

## === Step 2: Create Token Embeddings ===
# Series labels (note that "RET" is listed twice).
token_types = ["ESG", "RET", "ESGFO", "ESGSO", "RET", "ENV", "SOC", "GOV", "SENTI"]

tokens = []
embeddings = []            # one vector per entry of all_tokens, in order
token_embedding_dict = {}  # token text -> vector
embd_1 = None

# Compute the fixed embedding of every value token.
for token in all_tokens:
    embd_1 = token_embedding_dict[token] = generate_blockwise_series_embedding(token)
    embeddings.append(embd_1)
  #if token in ['<SENTI_POSITIVE>','<SENTI_NEUTRAL>','<SENTI_NEGATIVE>']:
   #   embeddings.append(generate_sentiment_embedding(token))
    #  token_embedding_dict[token] = generate_sentiment_embedding(token)
#for t_type in token_types:
 #   for v in values:
  #      tok = f"<{t_type}_{v:.2f}>"
   #     tokens.append(tok)

def print_series_block_map(series_keys, dim=768):
    """Print the inclusive index range each series occupies in a ``dim``-long vector.

    The vector is divided into ``len(series_keys)`` equal slices using integer
    division, assigned in the order the keys are given.
    """
    width = dim // len(series_keys)
    first = 0
    for name in series_keys:
        last = first + width - 1
        print(f"{name:6} → positions {first:3} to {last:3}")
        first += width

# Show which slice of the embedding each series occupies.
print_series_block_map(
    ["RET", "SOC", "GOV", "ESG", "ESGFO", "ESGSO", "ENV", "SENTI"]
)

from sklearn.preprocessing import StandardScaler

# Stack all token vectors into one matrix and standardise each column.
all_embeddings = StandardScaler().fit_transform(
    np.array(list(token_embedding_dict.values()))
)







# Buckets of (token, embedding) pairs, one per series prefix.
grouped_tokens = {
    'ESG': [],
    'ESGFO': [],
    'ESGSO': [],
    'RET': [],
    'ENV': [],
    'SOC': [],
    'GOV': [],
    'SENTI': []
}

# Put every token in the first bucket whose "<PREFIX_" it starts with.
# The trailing underscore keeps "<ESGFO_..." out of the 'ESG' bucket.
for token, embedding in token_embedding_dict.items():
    for prefix in grouped_tokens:
        if token.startswith(f"<{prefix}_"):
            grouped_tokens[prefix].append((token, embedding))
            break

# Reduce embeddings to 2D using PCA.
# This rebinds all_embeddings to the raw vectors in bucket order.
all_embeddings = [embedding for group in grouped_tokens.values() for _, embedding in group]
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(all_embeddings)

# Plot: a figure is created, but every drawing call below is commented out.
plt.figure(figsize=(12, 8))

colors = ['red', 'blue', 'green', 'orange', 'purple', 'yellow', 'brown','pink']
prefixes = list(grouped_tokens.keys())
start = 0  # row offset of the current bucket within reduced_embeddings

for i, prefix in enumerate(prefixes):
    group = grouped_tokens[prefix]
    count = len(group)
    if count == 0:
        continue
    # Rows start .. start + count - 1 belong to this prefix, because
    # all_embeddings was built in the same bucket order.
    x = reduced_embeddings[start:start + count, 0]
    y = reduced_embeddings[start:start + count, 1]
   # plt.scatter(x, y, label=prefix, color=colors[i])
    start += count

#plt.title("Token Embeddings by Series (PCA Projection)")
#plt.xlabel("PC1")
#plt.ylabel("PC2")
#plt.legend()
#plt.grid(True)
#plt.tight_layout()
#plt.show()

# Reset the buckets to empty lists.
grouped_tokens = {
    'ESG': [],
    'ESGFO': [],
    'ESGSO': [],
    'RET': [],
    'ENV': [],
    'SOC': [],
    'GOV': [],
    'SENTI': []
}



import numpy as np

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine similarity of two vectors as a built-in ``float``.

    A small epsilon is added to the denominator, so a zero vector gives
    ``0.0`` instead of a division error.
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    # Convert the NumPy scalar so the result matches the annotation.
    return float(dot_product / (norm1 * norm2 + 1e-10))

import numpy as np
import matplotlib.pyplot as plt

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    The denominator carries a 1e-10 offset, so zero vectors give 0.0.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-10
    return float(np.dot(vec1, vec2) / denominator)

def format_diff_token(value: float, is_integer: bool = False) -> str:
    """Format ``value`` as a signed, zero-padded token body.

    Args:
        value: Number to format.
        is_integer: When true return only the truncated integer part
            (``"-12"``); otherwise return ``II.DD`` with two decimals.

    Returns:
        The formatted string, e.g. ``"80.00"``, ``"-02.50"`` or ``"07"``.
    """
    sign = "-" if value < 0 else ""
    value = abs(value)
    if is_integer:
        return f"{sign}{int(value):02d}"
    # Round the whole magnitude to hundredths before splitting it, so that a
    # fraction rounding up to 100 carries into the integer part
    # (1.999 -> "02.00" instead of "01.100").
    whole, cents = divmod(int(round(value * 100)), 100)
    return f"{sign}{whole:02d}.{cents:02d}"

def get_SSNT_format(value, kind):
    """Return the ``<kind_value>`` token for ``value``.

    ``RET`` and ``SENTI`` tokens carry a truncated integer; every other kind
    carries a two-decimal number.
    """
    wants_integer = kind in {"RET", "SENTI"}
    rendered = format_diff_token(value, is_integer=wants_integer)
    return f"<{kind}_{rendered}>"

def plot_similarity_decay_all(token_embedding_dict,token_configs):
    """Plot cosine similarity to a base token against numeric distance, one line per series.

    Args:
        token_embedding_dict: Mapping from token text to embedding vector.
        token_configs: Iterable of dicts. Each needs ``"kind"`` (series
            prefix) and ``"base"`` (reference value). Categorical series set
            ``"categorical": True`` and list their values in ``"steps"``.
            Numeric series give ``"step"`` (spacing) and ``"range"`` (number
            of steps on each side of the base). An ``"int"`` entry may be
            present but is not read.

    Series whose base token is missing from ``token_embedding_dict`` are
    reported and skipped; individual missing step tokens are left out of the
    line. The figure is shown, then the base token of each series is printed.
    """
    plt.rcParams.update({'font.size': 12})
    fig, ax = plt.subplots(figsize=(5, 5))

    marker_cycle = ['o', 's', 'D', '^', 'v', 'X', '*']
    color_cycle = plt.cm.tab10.colors  # use matplotlib color cycle

    base_tokens = {}

    for i, cfg in enumerate(token_configs):
        kind = cfg["kind"]
        base = cfg["base"]
        is_cat = cfg.get("categorical", False)

        marker = marker_cycle[i % len(marker_cycle)]
        color = color_cycle[i % len(color_cycle)]
        x_offset = (i - 3) * 0.15  # shift lines slightly along X-axis

        base_token = get_SSNT_format(base, kind)
        base_tokens[kind] = base_token

        if base_token not in token_embedding_dict:
            print(f"Base token {base_token} not found. Skipping.")
            continue

        base_vec = token_embedding_dict[base_token]

        x_vals = []
        y_vals = []
        if is_cat:
            # Collect x and y together so a missing step token cannot leave
            # the two lists with different lengths, which ax.plot rejects.
            for j, val in enumerate(cfg["steps"]):
                tok = get_SSNT_format(val, kind)
                if tok not in token_embedding_dict:
                    continue
                x_vals.append(j + x_offset)
                y_vals.append(cosine_similarity(base_vec, token_embedding_dict[tok]))
            if y_vals:
                ax.plot(x_vals, y_vals, marker=marker, label=kind, linewidth=5.0, color=color)
                ax.set_xticks([0, 1, 2])
                ax.set_xticklabels(["-10", "00", "+10"])
        else:
            steps = [base + j * cfg["step"] for j in range(-cfg["range"], cfg["range"] + 1)]
            for val in steps:
                tok = get_SSNT_format(val, kind)
                if tok not in token_embedding_dict:
                    continue
                # Absolute distance: values below and above the base share
                # the same x position.
                distance = abs(val - base)
                x_vals.append(distance + x_offset)
                y_vals.append(cosine_similarity(base_vec, token_embedding_dict[tok]))
            if x_vals:
                ax.plot(x_vals, y_vals, marker=marker, label=kind, linewidth=2.5, color=color)

    ax.set_ylim(0.75, 1.0)
    ax.set_xlim(0, 14)
    ax.set_xlabel("Numeric Distance from Base Token", fontsize=12)
    ax.set_ylabel("Cosine Similarity", fontsize=12)
    ax.set_title("Similarity vs Numeric Distance", fontsize=12)
    ax.tick_params(axis='x', labelrotation=45)
    ax.grid(True, linestyle='--', alpha=1)
    ax.legend(title="Token Type", fontsize=12, loc='best')
    plt.tight_layout()
    plt.show()

    print("\n=== Base Tokens Used ===")
    for kind, tok in base_tokens.items():
        print(f"{kind:<6} → {tok}")

import numpy as np
import matplotlib.pyplot as plt

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Compute the cosine similarity of two vectors as a built-in float.

    A 1e-10 term in the denominator avoids dividing by zero.
    """
    magnitudes = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    similarity = np.dot(vec1, vec2) / (magnitudes + 1e-10)
    return float(similarity)

def format_diff_token(value: float, is_integer: bool = False) -> str:
    """Render ``value`` as the numeric part of a token.

    Args:
        value: Number to render.
        is_integer: If true, emit the truncated integer part only
            (e.g. ``"-12"``); otherwise emit ``II.DD`` with two decimals.

    Returns:
        A zero-padded string with a leading ``-`` for negative input.
    """
    sign = "-" if value < 0 else ""
    magnitude = abs(value)
    if is_integer:
        return f"{sign}{int(magnitude):02d}"
    # Work in whole hundredths so a fraction that rounds up to 1.00 carries
    # into the integer digits; rounding only the fraction would print "100".
    hundredths = int(round(magnitude * 100))
    return f"{sign}{hundredths // 100:02d}.{hundredths % 100:02d}"

def get_SSNT_format(value, kind):
    """Build the ``<kind_value>`` token text for ``value``.

    Kinds ``RET`` and ``SENTI`` use the integer rendering; all other kinds
    use the two-decimal rendering.
    """
    integer_kinds = {"RET", "SENTI"}
    body = format_diff_token(value, is_integer=(kind in integer_kinds))
    return f"<{kind}_{body}>"

def plot_similarity_decay_all_new(token_embedding_dict, token_configs):
    """Plot cosine similarity to a base token against numeric distance.

    One line is drawn per entry of ``token_configs`` on a shared axis. For
    each entry the embedding of the base token is compared with the
    embeddings of neighbouring tokens; tokens that are absent from
    ``token_embedding_dict`` are left out of the line.

    Args:
        token_embedding_dict: Mapping from token string (as produced by
            ``get_SSNT_format``) to its embedding vector.
        token_configs: Sequence of dicts. Every entry needs ``"kind"`` and
            ``"base"``. Entries with ``"categorical": True`` also need
            ``"steps"`` (explicit values, plotted at positions 0, 1, 2, ...);
            all other entries need ``"step"`` and ``"range"``, from which the
            values ``base + j * step`` for ``j`` in ``[-range, range]`` are
            derived. An ``"int"`` key may be present but is not consulted.

    Returns:
        None. The figure is shown and the base token of each kind is printed.
    """
    plt.rcParams.update({'font.size': 12})
    _fig, ax = plt.subplots(figsize=(6, 5))

    marker_cycle = ['o', 's', 'D', '^', 'v', 'X', '*', 'P']
    # Use a bright, gaudy color palette
    color_cycle = ['magenta', 'cyan', 'lime', 'red', 'orange', 'blue', 'purple', 'gold']

    base_tokens = {}

    for i, cfg in enumerate(token_configs):
        kind = cfg["kind"]
        base = cfg["base"]
        is_cat = cfg.get("categorical", False)

        marker = marker_cycle[i % len(marker_cycle)]
        color = color_cycle[i % len(color_cycle)]
        x_offset = (i - 3) * 0.15  # shift lines slightly along X-axis

        base_token = get_SSNT_format(base, kind)
        base_tokens[kind] = base_token

        if base_token not in token_embedding_dict:
            print(f"Base token {base_token} not found. Skipping.")
            continue

        base_vec = token_embedding_dict[base_token]

        x_vals = []
        y_vals = []
        if is_cat:
            # Collect x and y together so a missing token drops the whole
            # point; building x for every step up front would leave the two
            # lists with different lengths and make ax.plot fail.
            for position, val in enumerate(cfg["steps"]):
                tok = get_SSNT_format(val, kind)
                if tok in token_embedding_dict:
                    x_vals.append(position + x_offset)
                    y_vals.append(cosine_similarity(base_vec, token_embedding_dict[tok]))
            if y_vals:
                ax.plot(x_vals, y_vals, marker=marker, label=kind, linewidth=5.0, color=color)
                ax.set_xticks([0, 1, 2])
                ax.set_xticklabels(["-10", "00", "+10"])
        else:
            steps = [base + j * cfg["step"] for j in range(-cfg["range"], cfg["range"] + 1)]
            for val in steps:
                tok = get_SSNT_format(val, kind)
                if tok in token_embedding_dict:
                    # Values below and above the base share the same
                    # absolute distance, so both sides fold onto one axis.
                    x_vals.append(abs(val - base) + x_offset)
                    y_vals.append(cosine_similarity(base_vec, token_embedding_dict[tok]))
            if x_vals:
                ax.plot(x_vals, y_vals, marker=marker, label=kind, linewidth=2.5, color=color)

    ax.set_ylim(0.8, 1.0)
    ax.set_xlim(0, 14)
    ax.set_xlabel("Numeric Distance from Base Token", fontsize=12)
    ax.set_ylabel("Cosine Similarity", fontsize=12)
    ax.set_title("Similarity vs Numeric Distance", fontsize=12,fontweight='bold')
    ax.tick_params(axis='x', labelrotation=45)
    ax.grid(True, linestyle='--', alpha=1)
    ax.legend(title="Token Type", fontsize=14, loc='best')
    plt.tight_layout()
    plt.show()

    print("\n=== Base Tokens Used ===")
    for kind, tok in base_tokens.items():
        print(f"{kind:<6} → {tok}")

import pandas

import numpy as np

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Cosine similarity of two vectors (0.0 when either has zero norm)."""
    numerator = np.dot(vec1, vec2)
    # The small epsilon keeps the denominator non-zero for zero-norm inputs.
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-10
    return numerator / denominator

#+ {c} SOC Risk: {s_core[c]} + {c} GOV Risk: {g_core[c]}

import random

# Shuffle the company list with a fixed seed (reproducible split) and use the
# first 80 % for training, the remainder for testing.
random.seed(42)
companies_shuffled = random.sample(companies, len(companies))
split_index = int(0.8 * len(companies))
train_companies = companies_shuffled[:split_index]
test_companies = companies_shuffled[split_index:]


def _write_company_lines(path, tickers):
    """Write one formatted sample line per ticker to ``path``.

    A ticker that is missing from any of the per-company series is skipped.
    Only lookup failures are tolerated: other errors (an undefined series, a
    failing file write, ...) propagate instead of silently producing an
    empty dataset.

    Returns:
        The list of lines that were written, in order.
    """
    written = []
    with open(path, "w") as f:
        for c in tickers:
            try:
                line = f"Company: {c} ENV: {e_core[c]}  {c} SOC Risk: {s_core[c]}  {c} GOV Risk: {g_core[c]}  SENTI: {senti_core[c]}  ESG: {esg_core[c]} RET: {ret_core[c]}"
            except LookupError:
                # No data for this ticker in at least one series.
                continue
            written.append(line)
            f.write(line + "\n")
    return written


data_set_tokenized_samples_training = _write_company_lines(
    "training_data_set_company_kernel.txt", train_companies
)
data_set_tokenized_samples_testing = _write_company_lines(
    "testing_dataset_kernel.txt", test_companies
)

from transformers import GPT2TokenizerFast

# Build a GPT-2 fast tokenizer from vocabulary/merges files expected in the
# current working directory.
tokenizer_extended = GPT2TokenizerFast(
    vocab_file="vocab.json",
    merges_file="merges.txt")

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

# Register every entry of `all_possible_tokens` (defined elsewhere in the
# file) with the tokenizer, flagged as special tokens.
tokenizer_extended.add_tokens(all_possible_tokens, special_tokens=True)

# Name the padding / end-of-sequence / unknown tokens.
# NOTE(review): presumably these strings are part of all_possible_tokens or
# the base vocabulary — confirm, otherwise they have no id of their own.
tokenizer_extended.pad_token = "<PAD>"
tokenizer_extended.eos_token = "<EOS>"
tokenizer_extended.unk_token = "<UNK>"



# Vocabulary size after the additions.
print(len(tokenizer_extended))

#from google.colab import drive
#drive.mount('/content/drive')

# Save once
#tokenizer_extended.save_pretrained("/content/drive/MyDrive/gpt2_tokenizer")



#with torch.no_grad():
 #    for token, embedding in token_embedding_dict.items():
  #      idx = tokenizer_extended.convert_tokens_to_ids(token)
   #     model.transformer.wte.weight[idx] = torch.tensor(embedding)



!pip install dtaidistance

# Alias the training lines under a shorter name (same list object, no copy).
data_set_tokenized_samples=data_set_tokenized_samples_training



#dataset = PricePredictionDataset(data_set_tokenized_samples, tokenizer_extended)



# Bare expression: displays the list when run as a notebook cell; has no
# effect when the file is executed as a plain script.
data_set_tokenized_samples

!unzip 'GEN_RET1_80.zip' -d /GEN_RET1_80/

#****************content/esg_finetuned_gpt2_v2_0/
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
import torch
from peft import PeftModel, PeftConfig
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load tokenizer
#tokenizer = PreTrainedTokenizerFast(tokenizer_file="ESG_2_tokenizer.json")

from transformers import PreTrainedTokenizerFast

# Step 1: Load the tokenizer saved alongside the adapter. This rebinds
# `tokenizer_extended`, replacing the tokenizer object built earlier in the file.
tokenizer_extended = PreTrainedTokenizerFast.from_pretrained("/GEN_RET1_80/content/GEN_RET1_80")

tokenizer_extended.add_special_tokens({'pad_token': '<PAD>', 'eos_token': '<EOS>', 'unk_token': '<UNK>'})
print(len(tokenizer_extended))
# Step 2: Load stock GPT-2 and resize its embedding matrix to the tokenizer's
# vocabulary size before the adapter is attached.
base_model = GPT2LMHeadModel.from_pretrained("gpt2")
base_model.resize_token_embeddings(len(tokenizer_extended))    # IMPORTANT!

# Step 3: Load LoRA adapters on resized model
model = PeftModel.from_pretrained(base_model, "/GEN_RET1_80/content/GEN_RET1_80")




#model = model.merge_and_unload()
model.config.pad_token_id = tokenizer_extended.pad_token_id  # needed for loss masking
# Step 4: Move to device and set eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()



import torch
from transformers import GPT2LMHeadModel

# === Use your trained model (already LoRA + custom embeddings) ===
# Repeats the eval-mode / device placement so this cell can be run on its own;
# `device` is re-derived the same way (CUDA when available, else CPU).
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def quick_test(model, tokenizer, prompt, max_new_tokens=30):
    """Greedily generate a continuation of ``prompt`` and return it as text.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used both to encode ``prompt`` and to decode the
            generated ids.
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded first output sequence with special tokens stripped.
    """
    # Place the inputs on the device of the model that was passed in rather
    # than on a module-level global, matching generate_esg_series.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # deterministic for quick eval
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# === Example prompt: ENV + SENTI → ESG ===
# The prompt ends at "ESG:" so the model is asked to continue with ESG tokens.
prompt = "Company: RCL ENV: <ENV_55.00> <ENV_62.00> <ENV_68.00> SENTI: <SENTI_10> <SENTI_00> ESG:"

generated_text = quick_test(model, tokenizer_extended, prompt)
print("=== Generated ===")
print(generated_text)


# === Quick generic Q&A test ===
# Free-text prompts unrelated to the token format, run through the same model.
random_prompts = [
    "What is the capital of France?",
    "Write a short sentence about sustainability.",
    "Explain in one line what ESG stands for.",
    "Give me a random number between 1 and 100.",
    "What is 2 + 2?"
]
#model.disable_adapter()  # Turns off LoRA (uses base GPT-2 weights)
# Show which adapter configuration is loaded and which adapter is active.
print(model.peft_config)
print(model.active_adapter)
#model.set_adapter("default")     # Enable LoRA (default adapter)

# NOTE: the loop variable rebinds the module-level `prompt` defined above.
for prompt in random_prompts:

    response = quick_test(model, tokenizer_extended, prompt, max_new_tokens=40)
    print(f"\nPrompt: {prompt}\nResponse: {response}")

print(type(model))

import time
# WARNING: this loop never terminates. When the file is run top to bottom as a
# script, nothing below this point executes until the loop is interrupted.
# NOTE(review): presumably a notebook keep-alive cell — confirm it is still wanted.
while True:
    time.sleep(60)

def generate_esg_series(model, tokenizer, company_line, max_new_tokens=30):
    """
    Generate an ESG token series for one company line.

    The prompt is everything in ``company_line`` before "ESG:", followed by
    "ESG:" and a single seed token. The seed is the first ``<ESG_...>`` value
    found in the line, or "70.00" when the line contains none. Generation is
    greedy. The decoded text and the parsed values are printed for inspection.

    Returns a float NumPy array with every ``<ESG_...>`` value found between
    the last "ESG:" of the decoded text and the first end-of-sequence token.
    NOTE(review): assuming the decoded output includes the prompt, the seed
    value is the first element of the returned array — confirm.
    """
    esg_pattern = r"<ESG_([-+]?\d*\.\d+|\d+)>"

    # Pick the seed value: first actual ESG token, else a fixed fallback.
    known_values = re.findall(esg_pattern, company_line)
    if known_values:
        seed_value = known_values[0]
    else:
        seed_value = "70.00"

    context = company_line.split("ESG:")[0]
    prompt = context + f"ESG: <ESG_{seed_value}> "
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)

    # Greedy decoding; the EOS id doubles as the padding id.
    generated_ids = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    print("*** GENERATED TOKENS***")
    print(generated_text)
    print("*** /GENERATED TOKENS***")

    # Keep only the text after the last "ESG:" and before the first EOS token.
    after_last_esg = generated_text.split("ESG:")[-1]
    pred_part = after_last_esg.split(tokenizer.eos_token)[0]
    pred_values = [float(match) for match in re.findall(esg_pattern, pred_part)]
    print("*** pred_values ***")
    print(pred_values)
    print("*** /pred_values***")
    return np.array(pred_values, dtype=float)

# Palette of 25 bright named colors for plotting.
gaudy_colors = [
    'magenta', 'lime', 'cyan', 'orange', 'red',
    'blue', 'gold', 'purple', 'deeppink', 'chartreuse',
    'aqua', 'orangered', 'springgreen', 'yellow', 'violet',
    'dodgerblue', 'crimson', 'limegreen', 'turquoise', 'hotpink',
    'lawngreen', 'coral', 'fuchsia', 'mediumspringgreen', 'darkorange'
]

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

def evaluate_and_plot(model, tokenizer, test_samples):
    """Compare generated and actual ESG series per company, plot and print MSE.

    For each line of ``test_samples`` the actual ESG values are parsed from
    the text after "ESG:", a prediction is produced with
    ``generate_esg_series`` (20 new tokens), both series are truncated to the
    shorter length and their mean squared error is computed. Each company
    gets its own subplot in one tall figure. Companies for which no ESG
    value was generated are reported and skipped.

    Args:
        model: Model passed through to ``generate_esg_series``.
        tokenizer: Tokenizer passed through to ``generate_esg_series``.
        test_samples: Sequence of "Company: <ticker> ... ESG: ..." lines.

    Returns:
        None. Results are printed and plotted.
    """
    all_metrics = []
    plt.figure(figsize=(12, 6 * len(test_samples)))  # one subplot per company

    for idx, line in enumerate(test_samples):
        # The ticker is the whitespace-delimited word after "Company:".
        # Matching \S+ keeps dotted tickers such as "BRK.B" intact, which an
        # alphanumeric-only class would cut off at the dot.
        company_name = re.search(r"Company:\s*(\S+)", line).group(1)
        actual_values = np.array(
            [float(x) for x in re.findall(r"<ESG_([-+]?\d*\.\d+|\d+)>", line.split("ESG:")[1])],
            dtype=float
        )

        # Generate predictions
        predicted_values = generate_esg_series(model, tokenizer, line, max_new_tokens=20)

        # Align lengths
        min_len = min(len(actual_values), len(predicted_values))
        actual_values = actual_values[:min_len]
        predicted_values = predicted_values[:min_len]

        if len(predicted_values) == 0:
            print(f"{company_name} → No ESG values generated. Skipping.")
            continue

        # Compute MSE
        mse = mean_squared_error(actual_values, predicted_values)
        all_metrics.append((company_name, mse))

        # Print actual vs predicted values
        print(f"\n=== {company_name} ===")
        print(f"Actual ESG:    {actual_values}")
        print(f"Predicted ESG: {predicted_values}")
        print(f"MSE: {mse:.4f}")

        # === Plot ===
        plt.subplot(len(test_samples), 1, idx + 1)
        plt.plot(actual_values, label=f"{company_name} Actual ESG", linewidth=2.5, color='magenta')
        plt.plot(predicted_values, label=f"{company_name} Predicted ESG", linewidth=2.5, color='lime')
        plt.title(f"{company_name} ESG Forecast vs Actual (MSE={mse:.2f})", fontsize=12, fontweight='bold')
        plt.xlabel("Time Steps", fontsize=12)
        plt.ylabel("ESG", fontsize=12)
        plt.legend(fontsize=12)
        plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Aggregate metrics across companies
    if all_metrics:
        mean_mse = np.mean([m[1] for m in all_metrics])
        print("\n=== Aggregate Metrics Across Companies ===")
        for company, mse in all_metrics:
            print(f"{company}: MSE={mse:.4f}")
        print(f"\nAverage MSE across companies: {mean_mse:.4f}")

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import re, numpy as np

def evaluate_and_plot(model, tokenizer, test_samples, max_new_tokens=20):
    """
    Evaluate ESG predictions for each company line and print the MSE.

    For each line the actual ESG values are parsed from the text after
    "ESG:", a prediction is produced with ``generate_esg_series``, both
    series are truncated to the shorter length and their mean squared error
    is computed. Companies for which no ESG value was generated are reported
    and skipped.

    No figure is produced here: the function previously opened and showed a
    figure that nothing was ever drawn on. Plot from the returned dictionary
    instead.

    Args:
        model: Model passed through to ``generate_esg_series``.
        tokenizer: Tokenizer passed through to ``generate_esg_series``.
        test_samples: Sequence of "Company: <ticker> ... ESG: ..." lines.
        max_new_tokens: Generation budget per company.

    Returns:
        Dict mapping ticker to ``{"actual", "predicted", "mse"}``.
    """
    results = {}

    for line in test_samples:
        # The ticker is the whitespace-delimited word after "Company:".
        # Matching \S+ keeps dotted tickers such as "BRK.B" intact, which an
        # alphanumeric-only class would cut off at the dot.
        company_name = re.search(r"Company:\s*(\S+)", line).group(1)

        # Extract actual ESG series
        actual_values = np.array(
            [float(x) for x in re.findall(r"<ESG_([-+]?\d*\.\d+|\d+)>", line.split("ESG:")[1])],
            dtype=float
        )

        # Generate predictions
        predicted_values = generate_esg_series(model, tokenizer, line, max_new_tokens=max_new_tokens)

        # Align lengths
        min_len = min(len(actual_values), len(predicted_values))
        actual_values = actual_values[:min_len]
        predicted_values = predicted_values[:min_len]

        if len(predicted_values) == 0:
            print(f"{company_name} → No ESG values generated. Skipping.")
            continue

        # Compute MSE
        mse = mean_squared_error(actual_values, predicted_values)

        # Save results for external use
        results[company_name] = {
            "actual": actual_values,
            "predicted": predicted_values,
            "mse": mse
        }

        # === Print for quick check ===
        print(f"\n=== {company_name} ===")
        print(f"Actual ESG:    {actual_values}")
        print(f"Predicted ESG: {predicted_values}")
        print(f"MSE: {mse:.4f}")

    # === Aggregate metrics across companies ===
    if results:
        mean_mse = np.mean([v["mse"] for v in results.values()])
        print("\n=== Aggregate Metrics Across Companies ===")
        for company, data in results.items():
            print(f"{company}: MSE={data['mse']:.4f}")
        print(f"\nAverage MSE across companies: {mean_mse:.4f}")

    return results

# Evaluate the held-out (test) company lines with the default generation budget.
results = evaluate_and_plot(model, tokenizer_extended, data_set_tokenized_samples_testing)

# Bare expression: displays the dictionary when run as a notebook cell.
results

import numpy as np
import matplotlib.pyplot as plt



# Create individual plots
# One small figure per company in `results`: actual vs predicted ESG series,
# with the MSE shown in the title.
for company, data in results.items():
    actual = data['actual']
    predicted = data['predicted']
    mse = data['mse']

    plt.figure(figsize=(4, 4))
    plt.plot(actual, label='Actual', color='magenta', linewidth=2.5)
    plt.plot(predicted, label='Predicted', color='lime', linewidth=2.5)
    plt.title(f"{company} ESG Forecast vs Actual\nMSE: {mse:.2f}", fontsize=12)
    plt.xlabel("Time Steps", fontsize=12)
    plt.ylabel("ESG", fontsize=12)
    plt.legend(fontsize=12)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import re, numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, ignoring positions where y_true is 0.

    Returns NaN when every true value is zero.
    """
    truth = np.array(y_true)
    guess = np.array(y_pred)
    usable = truth != 0
    if not np.any(usable):
        return np.nan
    relative_error = (truth[usable] - guess[usable]) / truth[usable]
    return np.mean(np.abs(relative_error)) * 100

def evaluate_and_plot(model, tokenizer, test_samples, max_new_tokens=20):
    """
    Evaluate ESG predictions for each company line and print error metrics.

    For each line the actual ESG values are parsed from the text after
    "ESG:", a prediction is produced with ``generate_esg_series``, both
    series are truncated to the shorter length, and MSE, RMSE, MAE, MAPE and
    bias (mean of predicted minus actual) are computed. Companies for which
    no ESG value was generated are reported and skipped.

    No figure is produced here: the function previously opened and showed a
    figure that nothing was ever drawn on. Plot from the returned dictionary
    instead.

    Args:
        model: Model passed through to ``generate_esg_series``.
        tokenizer: Tokenizer passed through to ``generate_esg_series``.
        test_samples: Sequence of "Company: <ticker> ... ESG: ..." lines.
        max_new_tokens: Generation budget per company.

    Returns:
        Dict mapping ticker to ``{"actual", "predicted", "mse", "rmse",
        "mae", "mape", "bias"}``.
    """
    results = {}

    for line in test_samples:
        # The ticker is the whitespace-delimited word after "Company:".
        # Matching \S+ keeps dotted tickers such as "BRK.B" intact, which an
        # alphanumeric-only class would cut off at the dot.
        company_name = re.search(r"Company:\s*(\S+)", line).group(1)

        # Extract actual ESG series
        actual_values = np.array(
            [float(x) for x in re.findall(r"<ESG_([-+]?\d*\.\d+|\d+)>", line.split("ESG:")[1])],
            dtype=float
        )

        # Generate predictions
        predicted_values = generate_esg_series(model, tokenizer, line, max_new_tokens=max_new_tokens)

        # Align lengths
        min_len = min(len(actual_values), len(predicted_values))
        actual_values = actual_values[:min_len]
        predicted_values = predicted_values[:min_len]

        if len(predicted_values) == 0:
            print(f"{company_name} → No ESG values generated. Skipping.")
            continue

        # === Compute Metrics ===
        mse = mean_squared_error(actual_values, predicted_values)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(actual_values, predicted_values)
        mape = mean_absolute_percentage_error(actual_values, predicted_values)
        bias = np.mean(predicted_values - actual_values)

        # Save results
        results[company_name] = {
            "actual": actual_values,
            "predicted": predicted_values,
            "mse": mse,
            "rmse": rmse,
            "mae": mae,
            "mape": mape,
            "bias": bias
        }

        # Print per company
        print(f"\n=== {company_name} ===")
        print(f"Actual ESG:    {actual_values}")
        print(f"Predicted ESG: {predicted_values}")
        print(f"MSE: {mse:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f} | MAPE: {mape:.2f}% | Bias: {bias:.4f}")

    # === Aggregate metrics ===
    if results:
        mean_mse = np.mean([v["mse"] for v in results.values()])
        mean_rmse = np.mean([v["rmse"] for v in results.values()])
        mean_mae = np.mean([v["mae"] for v in results.values()])
        # nanmean: a company whose actual values are all zero has MAPE = NaN.
        mean_mape = np.nanmean([v["mape"] for v in results.values()])
        mean_bias = np.mean([v["bias"] for v in results.values()])

        print("\n=== Aggregate Metrics Across Companies ===")
        for company, data in results.items():
            print(f"{company}: MSE={data['mse']:.4f}, RMSE={data['rmse']:.4f}, MAE={data['mae']:.4f}, MAPE={data['mape']:.2f}%, Bias={data['bias']:.4f}")
        print(f"\nAverage MSE: {mean_mse:.4f}")
        print(f"Average RMSE: {mean_rmse:.4f}")
        print(f"Average MAE: {mean_mae:.4f}")
        print(f"Average MAPE: {mean_mape:.2f}%")
        print(f"Average Bias: {mean_bias:.4f}")

    return results

# Re-run the evaluation on the test lines; this rebinds `results`.
results = evaluate_and_plot(model, tokenizer_extended, data_set_tokenized_samples_testing)
