import os
import shutil

# Substrings looked for in each record's ``new_solution`` text by
# WordSearchBasedFilter. Matching is a plain, case-sensitive substring test,
# hence both the capitalised and lower-case English stems.
FILTER_WORD_FOR_REPHRASE = [
    "Rephra",
    "rephras",
    "重新表述",  # "re-express"
    "重新阐述",  # "re-elaborate"
    "重新证明",  # "re-prove"
    "重述",  # "restate"
    "官方",  # "official"
    "official",
    "原证明",  # "original proof"
]

class WordSearchBasedFilter():
    def __init__(self, search_dir: str):
        self.search_word = FILTER_WORD_FOR_REPHRASE
        self.search_dir = search_dir
        self.search_dict: None | dict[str, list[dict[str, str]]] = None
        self.searched_file = 0
        
    def find_word_in_file(self, word: str) -> tuple[list[dict[str, str]], int]:
        import os
        import json
        return_list = []
        searched_file = 0
        for file_name in os.listdir(self.search_dir):
            if file_name.endswith(".json"):
                searched_file += 1
                with open(os.path.join(self.search_dir, file_name), 'r', encoding='utf-8') as f:
                    content = json.load(f)
                    if word in content['new_solution']:
                        near_content = content['new_solution']
                        # Extract a snippet around the searched word
                        start_index = max(0, near_content.find(word) - 30)
                        end_index = min(len(near_content), near_content.find(word) + len(word) + 30)
                        # Ensure the snippet is not too long
                        if end_index - start_index > 60:
                            start_index = max(0, end_index - 60)
                        if start_index < 0:
                            start_index = 0
                        if end_index > len(near_content):
                            end_index = len(near_content)
                        # Get the content around the searched word  
                        near_content = near_content[start_index:end_index]
                        
                        return_list.append({
                            "file_name": file_name,
                            "content": content['new_solution'],
                            "near_content": near_content
                        })
        return return_list, searched_file
    
    def search(self):
        self.search_dict = {}
        for word in self.search_word:
            self.search_dict[word], searched_file = self.find_word_in_file(word)
            self.searched_file = searched_file

    def report(self, report_file: str | None = None):
        assert self.search_dict is not None
        
        if report_file is not None:
            file = open(report_file, 'w', encoding='utf-8')
        else:
            file = None
            
        print("Search results:", file=file)
        for word, results in self.search_dict.items():
            print("\n"+"*****" * 7 + "*" * (8 + len(word)) + "*****" * 7, file=file)
            print("*****" * 7 + f" Word: {word} " + "*****" * 7, file=file)
            print("*****" * 7 + "*" * (8 + len(word)) + "*****" * 7 + "\n", file=file)

            if results:
                for result in results:
                    print('='*30 + f" {result['file_name']} " + '='*30, file=file)
                    print(f"  Content: {result['near_content']}", file=file)
            else:
                print("  No occurrences found.", file=file)
        print(f"Search completed. We find totally {sum(len(v) for v in self.search_dict.values())} occurrences of the search words in total {self.searched_file} files.", file=file)
        
    def move(self, target_path: str):
        assert self.search_dict is not None
        os.makedirs(target_path, exist_ok=True)
        print(f"Moving files to {target_path}...")
        for word, results in self.search_dict.items():
            for result in results:
                orig_name = os.path.join(self.search_dir, result['file_name'])
                new_name = os.path.join(target_path, result['file_name'])
                try:
                    shutil.move(orig_name, new_name)
                except FileNotFoundError:
                    # This file could have been moved already.
                    continue
                print(f"Moved {orig_name} to {new_name}.")
        print('Done!')
                
if __name__ == "__main__":
    # Script entry point: scan save/rephrase, write the findings to a text
    # report, then move every flagged file into save/rephrase_removed.
    rephrase_filter = WordSearchBasedFilter("save/rephrase")
    rephrase_filter.search()
    rephrase_filter.report(report_file='tmp_rephrase_report.txt')
    rephrase_filter.move("save/rephrase_removed")