import os

# Directory whose '.txt' files are rewritten in place by the loop at the
# bottom of this script (hard-coded absolute Windows path).
source_dir = r'E:\2024\experiment_code_clone\total4\comment_embedding\comments\features\input_features'


def remove_duplicate_sentences(input_file):
    """Clean a one-sentence-per-line text file and rewrite it in place.

    Processing steps, applied in order:

    1. Blank lines are skipped.
    2. A line whose last whitespace-separated word is a continuation word
       ('be', 'is', 'and', ...) is considered unfinished and is joined with
       the following non-blank line using a single space. Joining repeats
       while the joined text still ends with a continuation word. A blank
       line or the end of the file ends a pending sentence as it is.
    3. Backticks are replaced by single quotes and leading '@' characters
       are removed.
    4. Sentences that are empty after step 3, or that were already seen,
       are dropped; the first occurrence keeps its position.

    Args:
        input_file: Path of the UTF-8 text file to clean. The file is
            overwritten with the result.

    Returns:
        None. The result is written back to ``input_file``.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # A line whose final word is one of these is treated as unfinished.
    # Whole-word comparison is used so that e.g. "this" or "contain" does
    # not count as ending with "is" / "in".
    continuation_words = frozenset((
        'be', 'is', 'and', 'means', 'the', 'include', 'not', 'should',
        'between', 'in',
    ))

    seen_sentences = set()  # normalized sentences already kept
    unique_sentences = []  # kept sentences, in first-seen order

    def _keep(sentence):
        # Normalize the sentence, then record it unless empty or a duplicate.
        sentence = sentence.replace("`", "'").lstrip('@')
        if sentence and sentence not in seen_sentences:
            seen_sentences.add(sentence)
            unique_sentences.append(sentence)

    pending = ''  # unfinished sentence waiting for its continuation line
    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            fragment = line.strip()
            if not fragment:
                # A blank line ends whatever sentence was still pending.
                if pending:
                    _keep(pending)
                    pending = ''
                continue

            sentence = pending + ' ' + fragment if pending else fragment
            if sentence.split()[-1] in continuation_words:
                # Still unfinished: wait for the next line before keeping it.
                pending = sentence
            else:
                _keep(sentence)
                pending = ''

    # Do not lose an unfinished sentence that sits on the last line.
    if pending:
        _keep(pending)

    # The input was fully read and closed above, so it is safe to overwrite.
    with open(input_file, 'w', encoding='utf-8') as file:
        file.writelines(sentence + '\n' for sentence in unique_sentences)


# Clean every '.txt' file located directly inside source_dir; each file is
# overwritten with its processed content.
for filename in os.listdir(source_dir):
    if not filename.endswith('.txt'):
        continue  # leave non-text entries untouched
    input_file = os.path.join(source_dir, filename)
    remove_duplicate_sentences(input_file)
