import json
import random
import nltk
from nltk.corpus import wordnet

# Download the WordNet corpus through nltk (required when the corpus has not
# been fetched yet). This call runs every time the module is loaded. In the
# visible code, WordNet is only referenced by the disabled synonym helper.
nltk.download('wordnet')

# Word substitution (synonym replacement).
# NOTE: this helper is currently disabled -- it is wrapped in a string literal,
# so the function below is never defined and cannot be called.
'''def replace_word_with_synonym(word):
    synonyms = wordnet.synsets(word)
    if synonyms:
        # 选择一个同义词，避免选择和原词相同的词
        synonym = random.choice(synonyms).lemmas()[0].name()
        if synonym != word:
            return synonym
    return word'''

# Swap word order
def shuffle_words(sentence, noise_ratio=0.13):
    """Return *sentence* with random pairs of words swapped.

    The sentence is split on whitespace, ``int(len(words) * noise_ratio)``
    swaps between two distinct random positions are applied, and the words
    are re-joined with single spaces (runs of whitespace are collapsed).

    Args:
        sentence: Text to perturb.
        noise_ratio: Fraction of the word count used as the number of swaps.

    Returns:
        The perturbed sentence. A sentence with fewer than two words cannot
        be swapped and is returned with only its whitespace normalized.
    """
    words = sentence.split()
    num_words = len(words)
    # A swap needs two distinct positions; random.sample would raise
    # ValueError on a population smaller than 2 (e.g. one word with a
    # noise_ratio >= 1).
    if num_words < 2:
        return ' '.join(words)
    num_changes = int(num_words * noise_ratio)
    for _ in range(num_changes):
        idx1, idx2 = random.sample(range(num_words), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

# Insert noise characters
def insert_noise(sentence, noise_ratio=0.13):
    """Append random noise characters to randomly chosen words.

    The sentence is split on whitespace; ``int(len(words) * noise_ratio)``
    times, a random word position is picked and one random symbol is appended
    to the word at that position (the same word may be picked repeatedly).
    The words are then re-joined with single spaces.

    Args:
        sentence: Text to perturb.
        noise_ratio: Fraction of the word count used as the number of
            insertions.

    Returns:
        The sentence with noise characters appended to some words.
    """
    symbols = ['@', '#', '$', '%', '&', '*', '!', '?', '1', '2', '3']
    tokens = sentence.split()
    token_count = len(tokens)
    for _ in range(int(token_count * noise_ratio)):
        target = random.randint(0, token_count - 1)
        tokens[target] += random.choice(symbols)
    return ' '.join(tokens)

# Delete some of the words
def delete_words(sentence, noise_ratio=0.13):
    """Remove randomly chosen words from *sentence*.

    The sentence is split on whitespace and ``int(len(words) * noise_ratio)``
    random positions are drawn (with replacement, so fewer distinct words may
    end up removed). The remaining words are re-joined with single spaces.

    Args:
        sentence: Text to perturb.
        noise_ratio: Fraction of the word count used as the number of draws.

    Returns:
        The sentence without the words at the drawn positions.
    """
    tokens = sentence.split()
    token_count = len(tokens)
    draws = int(token_count * noise_ratio)
    # Collect every drawn position; duplicates collapse naturally in the set.
    dropped = {random.randint(0, token_count - 1) for _ in range(draws)}
    survivors = [tok for pos, tok in enumerate(tokens) if pos not in dropped]
    return ' '.join(survivors)

# Apply noise to a single data sample
def apply_noise_to_sample(sample, noise_ratio=0.13):
    """Perturb the ``"instruction"`` and ``"output"`` fields of *sample*.

    Each field that is present and truthy is passed, in order, through
    ``shuffle_words``, ``insert_noise`` and ``delete_words``. The sample is
    modified in place and also returned.

    Args:
        sample: Mapping holding the text fields to perturb.
        noise_ratio: Ratio forwarded to every noise function.

    Returns:
        The same ``sample`` object, with its text fields replaced.
    """
    # Discrete noise steps, in application order. The synonym-replacement
    # step is intentionally not part of the pipeline (it is disabled).
    pipeline = (shuffle_words, insert_noise, delete_words)
    for field in ("instruction", "output"):
        if not sample.get(field):
            continue
        for perturb in pipeline:
            sample[field] = perturb(sample[field], noise_ratio)

    return sample

# Read a JSON file and apply noise to every sample
def apply_noise_to_dataset(input_file, output_file, noise_ratio=0.13):
    """Load samples from *input_file*, add noise, and write *output_file*.

    Args:
        input_file: Path of the UTF-8 JSON file to read; its top-level value
            is iterated sample by sample.
        output_file: Path of the UTF-8 JSON file to write.
        noise_ratio: Ratio forwarded to ``apply_noise_to_sample``.
    """
    with open(input_file, "r", encoding="utf-8") as src:
        samples = json.load(src)

    noisy_data = [apply_noise_to_sample(entry, noise_ratio) for entry in samples]

    # Save the noisy samples; non-ASCII text is written as-is.
    with open(output_file, "w", encoding="utf-8") as dst:
        json.dump(noisy_data, dst, ensure_ascii=False, indent=4)

# Example run
if __name__ == "__main__":
    # Placeholder paths: (input file path, output file path).
    input_file, output_file = r'', ''
    apply_noise_to_dataset(
        input_file=input_file,
        output_file=output_file,
        noise_ratio=0.13,
    )
