import spacy
import pandas as pd

# Large English spaCy pipeline, loaded once at start-up.
nlp = spacy.load('en_core_web_lg')

# Raw SST files: sentences are tab-separated, the split assignment is
# comma-separated; both use their first column as the index.
sst_sentences = pd.read_csv(
    'SST2/datasetSentences.txt', sep='\t', header=[0], index_col=0
)
sst_split = pd.read_csv(
    'SST2/datasetSplit.txt', sep=',', header=[0], index_col=0
)
sst_split  # notebook-style inspection; no effect when run as a script

# Attach each sentence's split label by joining on the shared index.
sst_dataset = sst_sentences.join(sst_split)
sst_dataset  # notebook-style inspection

# Split codes as used below: 1 -> train, 2 -> test, 3 -> validation.
_split_label = sst_dataset['splitset_label']
sst_train = sst_dataset[_split_label == 1]
sst_val = sst_dataset[_split_label == 3]
sst_test = sst_dataset[_split_label == 2]
sst_test  # notebook-style inspection

# Write every split as a tab-separated file (index column included).
_split_outputs = {
    'SST2/sst_test.csv': sst_test,
    'SST2/sst_train.csv': sst_train,
    'SST2/sst_val.csv': sst_val,
}
for _out_path, _split_frame in _split_outputs.items():
    _split_frame.to_csv(_out_path, sep='\t')
import numpy as np
import pandas as pd
import spacy
from tqdm.auto import tqdm

nlp = spacy.load('en_core_web_lg')
sst_test = pd.read_csv('SST2/sst_test.csv', sep='\t')

# Sentences with at most this many tokens get every possible token mask.
MAX_EXHAUSTIVE_TOKENS = 10
# Random masks drawn (before de-duplication) for longer sentences.
NUM_RANDOM_DRAWS = 1000
# Seed applied before processing each sentence.
SAMPLING_SEED = 42


def _masked_text(pieces, mask):
    """Concatenate the pieces whose mask character is '1'.

    ``pieces`` holds each token's text followed by its trailing whitespace;
    ``mask`` is a string of '0'/'1' characters aligned with ``pieces``.
    Leading and trailing whitespace of the result is stripped.
    """
    return ''.join(
        piece for piece, bit in zip(pieces, mask) if bit == '1'
    ).strip()


def _all_masks(n_tokens):
    """Yield every '0'/'1' mask of width ``n_tokens`` in counting order.

    Character ``j`` of a mask is bit ``j`` of the counter, which is why the
    most-significant-bit-first output of ``np.binary_repr`` is reversed.
    """
    for state in range(1 << n_tokens):
        yield ''.join(reversed(np.binary_repr(state, width=n_tokens)))


def _random_masks(n_tokens):
    """Yield distinct random masks, then the all-ones mask if not yet seen.

    Draws ``NUM_RANDOM_DRAWS`` masks from NumPy's global random state and
    yields each distinct one in first-seen order.
    """
    seen = set()  # set membership is O(1); a list scan would be O(n) per draw
    for _ in range(NUM_RANDOM_DRAWS):
        mask = ''.join(str(bit) for bit in np.random.choice(2, n_tokens))
        if mask not in seen:
            seen.add(mask)
            yield mask
    full_mask = '1' * n_tokens
    if full_mask not in seen:
        yield full_mask


# One record per (sentence, mask): the positional sentence index, the
# original sentence, the mask string and the sentence restricted to the
# tokens selected by the mask.
ssamples = []
for i, text in enumerate(tqdm(sst_test['sentence'], total=len(sst_test))):
    # Re-seed for every sentence so the masks drawn for a sentence depend
    # only on its token count, not on its position in the file.
    np.random.seed(SAMPLING_SEED)
    # Only tokenization is needed here, so skip the rest of the pipeline.
    doc = nlp.make_doc(text)
    pieces = [token.text + token.whitespace_ for token in doc]
    if len(doc) <= MAX_EXHAUSTIVE_TOKENS:
        masks = _all_masks(len(doc))
    else:
        masks = _random_masks(len(doc))
    ssamples.extend(
        {
            'sentence_index': i,
            'sentence': text,
            'binary_representation': mask,
            'sample_sentence': _masked_text(pieces, mask),
        }
        for mask in masks
    )

# Materialise the collected records and persist them as TSV and JSON.
samples = pd.DataFrame(ssamples)
samples.to_csv('SST2/sst_test_samples.csv', sep='\t')
samples.to_json('SST2/sst_test_samples.json', orient='records')

# Read the TSV back to check that it round-trips.
# - dtype=str keeps the '0'/'1' mask strings as text; otherwise they are
#   parsed as integers and their leading zeros are lost.
# - keep_default_na=False keeps empty sample sentences (the all-zeros
#   mask) as '' instead of turning them into NaN.
samples2 = pd.read_csv(
    'SST2/sst_test_samples.csv',
    sep='\t',
    index_col=0,
    dtype={'binary_representation': str},
    keep_default_na=False,
)
samples  # notebook-style inspection; no effect when run as a script
samples2
samples == samples2
# Reduce the element-wise comparison over rows and then columns. The
# built-in all() would iterate the column labels of the frame, which are
# non-empty strings, and therefore always return True.
(samples == samples2).all().all()
import openai
import os

# Inspect the proxy bypass setting; .get() avoids a KeyError when the
# variable is not set in this environment.
os.environ.get('no_proxy')

# Client for an OpenAI-compatible server on localhost; the API key is a
# placeholder string.
client = openai.OpenAI(base_url='http://localhost:30000/v1', api_key="None")

# Request a single token with log-probabilities for a short multi-turn chat.
response = client.chat.completions.create(
    # The original literal had a stray leading single quote in the name.
    model="LLM-Research/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {
            "role": "system",
            "content": "You are a knowledgeable historian who provides concise responses.",
        },
        {"role": "user", "content": "Tell me about ancient Rome"},
        {
            "role": "assistant",
            "content": "Ancient Rome was a civilization centered in Italy.",
        },
        {"role": "user", "content": "What were their major achievements?"},
    ],
    temperature=0,  # Zero temperature for the most deterministic output
    max_tokens=1,  # Only the first generated token is requested
    top_p=0.95,  # Nucleus sampling cut-off
    presence_penalty=0.2,  # NOTE(review): likely irrelevant for a 1-token reply - confirm
    frequency_penalty=0.2,  # NOTE(review): likely irrelevant for a 1-token reply - confirm
    n=1,  # Single completion
    seed=42,  # Fixed seed for reproducibility
    logprobs=True,  # Return log-probabilities of the generated token
    top_logprobs=3,  # Also return the 3 most likely alternatives per position
)

client.chat.completions  # notebook-style inspection; no effect in a script
# Log-probability entry for the first (and only) generated token.
response.choices[0].logprobs.content[0]