import pandas as pd
import csv
import os

def remove_adjacent_duplicates(sequence):
    """Collapse every run of consecutive equal items into a single item.

    Only neighbouring repeats are removed; an item that reappears later,
    after a different item, is kept.

    Args:
        sequence: An indexable sequence of items comparable with ``!=``.
            A falsy value (empty sequence or ``None``) yields ``[]``.

    Returns:
        A new list containing the items of ``sequence`` in their original
        order, with adjacent duplicates dropped.
    """
    if not sequence:
        return []
    collapsed = []
    for position, item in enumerate(sequence):
        # The first item is always kept; later items only when they differ
        # from the most recently kept one.
        if position == 0 or item != collapsed[-1]:
            collapsed.append(item)
    return collapsed

# Load the raw API-call sequences: one sample per line, API names separated
# by single spaces. Blank lines are recorded in `invalid_lines` (with their
# 1-based line number) and skipped.
file_path = './data/mal_api_2019/mal_api_2019.txt'
lines = []
invalid_lines = []
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                invalid_lines.append((i, "Empty line"))
                continue
            # A stripped, non-empty line starts with a non-space character,
            # so the token list below always has at least one entry; the
            # filter only removes the empty strings produced by repeated
            # spaces between tokens.
            lines.append([api for api in line.split(' ') if api])
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit(1)
except UnicodeDecodeError:
    print("Error: File encoding issue. Try 'latin1' or check file encoding.")
    exit(1)

# Report skipped lines and make sure exactly `expected_samples` sequences
# are kept: too few is fatal, surplus sequences are dropped from the end.
total_lines = len(lines)
if invalid_lines:
    print("First 10 invalid lines:", invalid_lines[:10])

expected_samples = 7107
if total_lines < expected_samples:
    print(f"Error: Total valid lines ({total_lines}) less than expected samples ({expected_samples})")
    exit(1)
if total_lines > expected_samples:
    # Warn before truncating, mirroring how surplus labels are handled, so
    # discarded data never goes unnoticed.
    print(f"Warning: Total valid lines ({total_lines}) exceeds expected samples ({expected_samples}). "
          f"Using first {expected_samples} valid lines.")
    lines = lines[:expected_samples]

# Load the labels, one per line. Blank lines are noted in `invalid_labels`
# (with their 1-based line number) and do not produce a label.
label_file_path = './data/mal_api_2019/mal_api_2019_lables.txt'
labels = []
invalid_labels = []
try:
    with open(label_file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            label = line.strip()
            if label:
                labels.append(label)
            else:
                invalid_labels.append((i, "Empty label"))
except UnicodeDecodeError:
    print("Error: Label file encoding issue. Try 'latin1' or check file encoding.")
    exit(1)
except FileNotFoundError:
    print(f"Error: Label file not found at {label_file_path}")
    exit(1)

total_labels = len(labels)

if invalid_labels:
    print("First 10 invalid labels:", invalid_labels[:10])

# Too few labels is fatal; surplus labels are cut off after a warning.
if total_labels < expected_samples:
    print(f"Error: Total valid labels ({total_labels}) less than expected samples ({expected_samples})")
    exit(1)
elif total_labels > expected_samples:
    print(
        f"Warning: Total valid labels ({total_labels}) exceeds expected samples ({expected_samples}). "
        f"Using first {expected_samples} valid labels."
    )
    labels = labels[:expected_samples]

# Collapse adjacent repeats in every sequence, then keep at most the first
# 100 APIs of what remains.
api_sequences = [
    remove_adjacent_duplicates(raw_sequence)[:100]
    for raw_sequence in lines
]

if len(api_sequences) != expected_samples:
    print(f"Error: Expected {expected_samples} API sequences, got {len(api_sequences)}")
    exit(1)

empty_sequences = [position for position, seq in enumerate(api_sequences) if not seq]
if empty_sequences:
    print(f"Warning: {len(empty_sequences)} empty sequences found at indices: {empty_sequences}")

# One row per sample: the comma-joined API names next to the sample's label.
csv_data = {
    'api_sequence': list(map(','.join, api_sequences)),
    'label': labels,
}

csv_df = pd.DataFrame(csv_data)

output_path = './data/mal_api_2019/mal_api_2019_raw_100.csv'
csv_df.to_csv(output_path, index=False, encoding='utf-8', quoting=csv.QUOTE_MINIMAL)
print(f"CSV saved to {output_path}")

# Read the raw CSV back from disk and sanity-check its size and columns.
csv_file_path = './data/mal_api_2019/mal_api_2019_raw_100.csv'
try:
    csv_df = pd.read_csv(csv_file_path, encoding='utf-8')
except UnicodeDecodeError:
    print(f"Error: File encoding issue for {csv_file_path}. Try 'latin1'.")
    exit(1)
except pd.errors.EmptyDataError:
    print(f"Error: File {csv_file_path} is empty")
    exit(1)
except FileNotFoundError:
    print(f"Error: File not found at {csv_file_path}")
    exit(1)

expected_samples = 7107
total_samples = len(csv_df)
print(f"Total samples in CSV: {total_samples}")
# A row-count mismatch is only reported; processing continues.
if total_samples != expected_samples:
    print(f"Warning: Expected {expected_samples} samples, got {total_samples}")

# Missing columns are fatal.
expected_columns = ['api_sequence', 'label']
if any(col not in csv_df.columns for col in expected_columns):
    print(f"Error: CSV does not contain expected columns: {expected_columns}")
    exit(1)

# Gather the lower-cased API names that occur anywhere in the CSV.
unique_apis = set()
for sequence in csv_df['api_sequence']:
    if not pd.isna(sequence):
        unique_apis.update(map(str.lower, sequence.split(',')))
    else:
        print("Warning: Found NaN in api_sequence")

# From here on `unique_apis` is an alphabetically sorted list.
unique_apis = sorted(unique_apis)
total_unique_apis = len(unique_apis)
print(f"Total unique APIs: {total_unique_apis}")
print(f"First 10 unique APIs: {unique_apis[:10]}")

if 'sekey' in unique_apis:
    print("Warning: Found 'sekey' in unique APIs")

# Load the reference API list from Excel, lower-case its first column and
# write the result back over the same file.
api_307_file_path = './data/mal_api_2019/API_name_307.xlsx'
if not os.path.exists(api_307_file_path):
    print(f"Error: File not found at {api_307_file_path}")
    exit(1)

try:
    api_307_df = pd.read_excel(api_307_file_path, header=0, engine='openpyxl')
except ImportError:
    print("Error: 'openpyxl' is not installed. Install it using 'pip install openpyxl'.")
    exit(1)
except FileNotFoundError:
    print(f"Error: File not found at {api_307_file_path}")
    exit(1)
except ValueError as e:
    print(f"Error: Failed to read Excel file {api_307_file_path}. Details: {e}")
    exit(1)

expected_307_columns = 2
if len(api_307_df.columns) != expected_307_columns:
    print(f"Error: Expected {expected_307_columns} columns in API_name_307, got {api_307_df.shape[1]}")
    exit(1)

# Missing cells are left untouched; every other value is stringified and
# lower-cased.
api_307_df.iloc[:, 0] = api_307_df.iloc[:, 0].apply(
    lambda x: x if pd.isna(x) else str(x).lower()
)

# NOTE: this overwrites the input workbook in place.
try:
    api_307_df.to_excel(api_307_file_path, index=False, engine='openpyxl')
    print(f"API_name_307 updated with lowercase API names and saved to {api_307_file_path}")
except Exception as e:
    print(f"Error: Failed to save updated API_name_307 to {api_307_file_path}. Details: {e}")
    exit(1)

# The non-missing names of the first column, checked against the expected count.
api_307 = api_307_df.iloc[:, 0].dropna().tolist()
total_api_307 = len(api_307)
expected_api_307 = 307

if total_api_307 != expected_api_307:
    print(f"Warning: Expected {expected_api_307} APIs in API_name_307, got {total_api_307}")

# Pair every unique API name with the second-column value of its first
# matching row in the reference list; names without a match keep None.
unique_apis_df = pd.DataFrame(unique_apis, columns=['API_Name'])
unique_apis_df['Data'] = None

reference_names = api_307_df.iloc[:, 0]
# The frame was just built from a plain list, so its index is 0..n-1 and
# the enumerate position is also the row label.
for idx, api_name_clean in enumerate(unique_apis_df['API_Name']):
    match = api_307_df[reference_names == api_name_clean]
    if match.empty:
        continue
    unique_apis_df.at[idx, 'Data'] = match.iloc[0, 1]

output_csv_path = './data/mal_api_2019/unique_apis_with_data_100.csv'
try:
    os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
    unique_apis_df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"Unique APIs with data successfully saved to {output_csv_path}")
except Exception as e:
    print(f"Error: Failed to save unique APIs to {output_csv_path}. Details: {e}")
    exit(1)

# Report which API names seen in the dataset are absent from the reference
# list. A set makes each membership test O(1).
api_307_set = set(api_307)
missing_apis = [api for api in unique_apis if api not in api_307_set]

if missing_apis:
    print("Missing APIs:", missing_apis)
else:
    # Name the CSV this script actually writes and reads (with the _100 suffix).
    print("All APIs in mal_api_2019_raw_100.csv are present in API_name_307")

#****************************************************************************************
#Remove samples with fewer than 100 APIs.
import pandas as pd
import os

# Load the raw CSV for filtering. A decoding failure is retried once with
# latin1; a missing or empty file is fatal.
csv_file_path = './data/mal_api_2019/mal_api_2019_raw_100.csv'
try:
    df = pd.read_csv(csv_file_path, encoding='utf-8')
except UnicodeDecodeError:
    print(f"Error: File encoding issue for {csv_file_path}. Try 'latin1'.")
    try:
        df = pd.read_csv(csv_file_path, encoding='latin1')
    except Exception as e:
        print(f"Error: Failed to read file with 'latin1' encoding. Details: {e}")
        exit(1)
except pd.errors.EmptyDataError:
    print(f"Error: File {csv_file_path} is empty")
    exit(1)
except FileNotFoundError:
    print(f"Error: File not found at {csv_file_path}")
    exit(1)

expected_columns = ['api_sequence', 'label']
if any(col not in df.columns for col in expected_columns):
    print(f"Error: CSV does not contain expected columns: {expected_columns}")
    exit(1)

def count_apis(api_sequence):
    """Return the number of comma-separated entries in ``api_sequence``.

    Args:
        api_sequence: A comma-joined string of API names, or a missing
            value (``None`` / NaN).

    Returns:
        ``0`` for a missing value; otherwise the number of comma-separated
        fields, so an empty string counts as one field.
    """
    # N commas always delimit N + 1 fields.
    return 0 if pd.isna(api_sequence) else api_sequence.count(',') + 1

# Count the APIs of every sample, report the ones with fewer than 100, and
# keep only the samples that have at least 100.
df['api_count'] = df['api_sequence'].apply(count_apis)
too_short = df['api_count'] < 100
samples_less_than_100 = df[too_short]
num_samples_less_than_100 = len(samples_less_than_100)

print(f"Number of samples with less than 100 APIs: {num_samples_less_than_100}")
if num_samples_less_than_100 > 0:
    # Print only the first 10 indices.
    print(f"Sample indices with less than 100 APIs: {samples_less_than_100.index[:10].tolist()}")

# The helper `api_count` column is left out of the saved file.
long_enough = df['api_count'] >= 100
filtered_df = df[long_enough][['api_sequence', 'label']]

output_csv_path = './data/mal_api_2019/mal_api_2019_filtered_100.csv'
try:
    os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
    filtered_df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"Filtered data successfully saved to {output_csv_path}")
except Exception as e:
    print(f"Error: Failed to save filtered data to {output_csv_path}. Details: {e}")
    exit(1)

#********************************************************************************
import pandas as pd
import numpy as np
import os

# Load the filtered samples. A decoding failure is retried once with
# latin1; a missing or empty file is fatal.
csv_file_path = './data/mal_api_2019/mal_api_2019_filtered_100.csv'
try:
    raw_df = pd.read_csv(csv_file_path, encoding='utf-8')
except UnicodeDecodeError:
    print(f"Error: File encoding issue for {csv_file_path}. Try 'latin1'.")
    try:
        raw_df = pd.read_csv(csv_file_path, encoding='latin1')
    except Exception as e:
        print(f"Error: Failed to read file with 'latin1' encoding. Details: {e}")
        exit(1)
except pd.errors.EmptyDataError:
    print(f"Error: File {csv_file_path} is empty")
    exit(1)
except FileNotFoundError:
    print(f"Error: File not found at {csv_file_path}")
    exit(1)

# The expected row count is hard-coded; a mismatch is reported but not fatal.
expected_samples = 5274
total_samples = len(raw_df)
print(f"Total samples in CSV: {total_samples}")
if total_samples != expected_samples:
    print(f"Warning: Expected {expected_samples} samples, got {total_samples}")

expected_columns = ['api_sequence', 'label']
if any(col not in raw_df.columns for col in expected_columns):
    print(f"Error: CSV does not contain expected columns: {expected_columns}")
    exit(1)

# Load the API-name -> Data table and turn it into a lookup dictionary.
unique_apis_file_path = './data/mal_api_2019/unique_apis_with_data_100.csv'
try:
    unique_apis_df = pd.read_csv(unique_apis_file_path, encoding='utf-8')
except FileNotFoundError:
    print(f"Error: File not found at {unique_apis_file_path}")
    exit(1)
except pd.errors.EmptyDataError:
    print(f"Error: File {unique_apis_file_path} is empty")
    exit(1)
except UnicodeDecodeError:
    print(f"Error: File encoding issue for {unique_apis_file_path}. Try 'latin1'.")
    try:
        unique_apis_df = pd.read_csv(unique_apis_file_path, encoding='latin1')
    except Exception as e:
        print(f"Error: Failed to read file with 'latin1' encoding. Details: {e}")
        exit(1)

expected_unique_columns = ['API_Name', 'Data']
if not all(col in unique_apis_df.columns for col in expected_unique_columns):
    print(f"Error: CSV does not contain expected columns: {expected_unique_columns}")
    exit(1)

# Normalise names the same way the lookup side does (trimmed, lower-case);
# missing names become the empty string.
unique_apis_df['API_Name'] = unique_apis_df['API_Name'].apply(lambda x: str(x).strip().lower() if pd.notna(x) else '')

# Only names that actually carry a Data value go into the lookup. A name
# with a missing Data cell would otherwise count as "found" and silently
# put NaN into the matrix instead of being detected as unmatched.
has_data = pd.notna(unique_apis_df['Data'])
api_to_index = dict(zip(unique_apis_df['API_Name'][has_data], unique_apis_df['Data'][has_data]))

# Build the (samples x 101) matrix: columns 0..99 hold the value that
# `api_to_index` maps each API name to, and the last column holds the label.
# Cells that cannot be filled stay None.
num_rows = total_samples
num_apis_per_row = 100
output_matrix = np.empty((num_rows, num_apis_per_row + 1), dtype=object)
output_matrix.fill(None)

unmatched_apis = set()
for i, (api_sequence, label) in enumerate(zip(raw_df['api_sequence'], raw_df['label'])):
    # The label is stored for every row, including rows skipped below.
    output_matrix[i, num_apis_per_row] = label

    if pd.isna(api_sequence):
        print(f"Warning: NaN found in api_sequence at row {i}")
        continue

    apis = [api.strip() for api in api_sequence.split(',')]
    if len(apis) != num_apis_per_row:
        print(f"Warning: Row {i} has {len(apis)} APIs, expected {num_apis_per_row}")
        continue

    for j, api in enumerate(apis):
        api_clean = api.lower()
        if api_clean in api_to_index:
            output_matrix[i, j] = api_to_index[api_clean]
            continue
        # No mapping: the cell keeps its None and the name is remembered
        # for the summary after the loop.
        unmatched_apis.add(api_clean)
        if i < 2:
            # Per-occurrence warnings are limited to the first two rows to
            # keep the output readable.
            print(f"Warning: API '{api_clean}' at row {i}, position {j} not found in unique_apis_with_data_100.csv")

# Report every unmatched name once, so gaps beyond the first two rows do
# not go unnoticed.
if unmatched_apis:
    print(f"Warning: {len(unmatched_apis)} distinct APIs not found in unique_apis_with_data_100.csv: "
          f"{sorted(unmatched_apis)}")

# Name the columns API_1 .. API_100 plus 'label' and write the matrix out.
columns = [*(f'API_{n}' for n in range(1, num_apis_per_row + 1)), 'label']
output_df = pd.DataFrame(output_matrix, columns=columns)

output_csv_path = './data/mal_api_2019/api_index_matrix_100.csv'
try:
    target_dir = os.path.dirname(output_csv_path)
    os.makedirs(target_dir, exist_ok=True)
    output_df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"API index matrix successfully saved to {output_csv_path}")
except Exception as e:
    print(f"Error: Failed to save API index matrix to {output_csv_path}. Details: {e}")
    exit(1)

# Read the index matrix back from disk, validate it, and overwrite the
# label of every row with 1.
matrix_file_path = './data/mal_api_2019/api_index_matrix_100.csv'
try:
    matrix_df = pd.read_csv(matrix_file_path, encoding='utf-8')
except UnicodeDecodeError:
    print(f"Error: File encoding issue for {matrix_file_path}. Try 'latin1'.")
    try:
        matrix_df = pd.read_csv(matrix_file_path, encoding='latin1')
    except Exception as e:
        print(f"Error: Failed to read file with 'latin1' encoding. Details: {e}")
        exit(1)
except pd.errors.EmptyDataError:
    print(f"Error: File {matrix_file_path} is empty")
    exit(1)
except FileNotFoundError:
    print(f"Error: File not found at {matrix_file_path}")
    exit(1)

expected_samples = 5274
expected_columns = [*(f'API_{n}' for n in range(1, 101)), 'label']
total_samples = len(matrix_df)
print(f"Total samples in api_index_matrix_100.csv: {total_samples}")
if total_samples != expected_samples:
    print(f"Warning: Expected {expected_samples} samples, got {total_samples}")
if any(col not in matrix_df.columns for col in expected_columns):
    print(f"Error: CSV {matrix_file_path} does not contain expected columns: {expected_columns}")
    exit(1)

# Only the in-memory frame is changed here; the CSV on disk is not rewritten.
matrix_df['label'] = 1
print("Labels in api_index_matrix_100.csv replaced with 1")

# Load the second dataset and rename its columns (t_0..t_99, malware) to
# the matrix layout (API_1..API_100, label).
dynamic_file_path = './data/dynamic_api_call_data/dynamic_api_call_sequence_20000.csv'
try:
    dynamic_df = pd.read_csv(dynamic_file_path, encoding='utf-8')
except UnicodeDecodeError:
    print(f"Error: File encoding issue for {dynamic_file_path}. Try 'latin1'.")
    try:
        dynamic_df = pd.read_csv(dynamic_file_path, encoding='latin1')
    except Exception as e:
        print(f"Error: Failed to read file with 'latin1' encoding. Details: {e}")
        exit(1)
except pd.errors.EmptyDataError:
    print(f"Error: File {dynamic_file_path} is empty")
    exit(1)
except FileNotFoundError:
    print(f"Error: File not found at {dynamic_file_path}")
    exit(1)

dynamic_expected_columns = [*(f't_{k}' for k in range(100)), 'malware']
if any(col not in dynamic_df.columns for col in dynamic_expected_columns):
    print(f"Error: CSV {dynamic_file_path} does not contain expected columns: {dynamic_expected_columns}")
    exit(1)

# t_k is zero-based while API_n is one-based, hence the +1 shift.
column_mapping = {
    **{f't_{k}': f'API_{k + 1}' for k in range(100)},
    'malware': 'label',
}
dynamic_df = dynamic_df.rename(columns=column_mapping)

# Keep the rows of the second dataset whose label is 0 and append them
# below the matrix rows (which all carry label 1).
# NOTE(review): despite its name, `malware_df` holds the label == 0 rows;
# presumably these are the benign samples - confirm against the dataset.
# NOTE(review): columns of `dynamic_df` other than the renamed ones, if
# any exist, are carried into the merged file - verify this is intended.
label_is_zero = dynamic_df['label'] == 0
malware_df = dynamic_df[label_is_zero].copy()

merged_df = pd.concat((matrix_df, malware_df), ignore_index=True)

output_csv_path = './data/mal_api_2019/merged_api_index_data.csv'
try:
    target_dir = os.path.dirname(output_csv_path)
    os.makedirs(target_dir, exist_ok=True)
    merged_df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"Merged data successfully saved to {output_csv_path}")
except Exception as e:
    print(f"Error: Failed to save merged data to {output_csv_path}. Details: {e}")
    exit(1)
