import json
from datasketch import MinHash
from hashlib import sha1

def minhash_deduplicate(texts, num_permutations=128):
    """Keep the first text for each distinct MinHash signature.

    Every text is split on whitespace into a set of distinct tokens. Each
    token is UTF-8 encoded, SHA-1 hashed, and fed into a ``MinHash`` built
    with ``num_permutations`` permutations. The resulting signature,
    converted to ``bytes``, is the deduplication key.

    Note that keys are compared for exact equality; no similarity
    threshold is applied.

    Args:
        texts: Iterable of strings to deduplicate.
        num_permutations: Value passed to ``MinHash`` as ``num_perm``.

    Returns:
        A list of the retained texts, in their original order.
    """
    unique_texts = []
    known_signatures = set()

    for document in texts:
        # Token order and repetition are discarded by building a set.
        tokens = set(document.split())
        sketch = MinHash(num_perm=num_permutations)
        for token in tokens:
            sketch.update(sha1(token.encode('utf8')).digest())

        # bytes() makes the signature hashable so it can live in a set.
        signature = bytes(sketch.digest())
        if signature in known_signatures:
            continue

        known_signatures.add(signature)
        unique_texts.append(document)

    return unique_texts

def filter_short_texts(texts, min_length=5):
    """Return the texts whose length is at least ``min_length``.

    Args:
        texts: Iterable of sized items (strings in this file's usage).
        min_length: Minimum ``len()`` an item needs in order to be kept.

    Returns:
        A new list with the qualifying items in their original order.
    """
    long_enough = []
    for candidate in texts:
        if len(candidate) >= min_length:
            long_enough.append(candidate)
    return long_enough

def filter_and_deduplicate(texts, min_length=5, num_permutations=128):
    """Filter out short texts, then deduplicate what remains.

    Args:
        texts: Iterable of strings to process.
        min_length: Forwarded to ``filter_short_texts``.
        num_permutations: Forwarded to ``minhash_deduplicate``.

    Returns:
        The list returned by ``minhash_deduplicate`` for the filtered texts.
    """
    return minhash_deduplicate(
        filter_short_texts(texts, min_length),
        num_permutations,
    )

# Load the input as one text per line. The file is read line by line rather
# than parsed as JSON, and surrounding whitespace is stripped from each line.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (assumes the input file is UTF-8).
with open('wikitext.json', 'r', encoding='utf-8') as f:
    texts = [line.strip() for line in f]

result = filter_and_deduplicate(texts, min_length=5, num_permutations=128)

# ensure_ascii=False writes non-ASCII characters verbatim, so the output file
# needs an encoding that can represent them; a locale default such as cp1252
# could otherwise raise UnicodeEncodeError.
with open("wikitext_new.json", "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=4)
