import argparse import json import re from datasets import Dataset, load_dataset from tqdm import tqdm def contains_unwanted_words(text):  unwanted_words = [  "text-based AI language model",  "domestic violence",  "please refrain",  "derogatory",  "inappropriate",  "offensive",  "racism",  "racist",  "racial",  "discriminate",  "discriminatory",  "discrimination",  "sexist",  "sexism",  "unacceptable",  "inclusive workplace",  "lgbt",  "morals",  "ethics",  "ethical",  "legaty",  "illegal",  "illegaty",  "hateful",  "harmful",  "it is never okay",  "It is important to",  "It's important to",  "real-world consequences",  "hate speech",  "glorify",  "not be appropriate",  "premacist",  "extremist",  "responsible AI",  "AI principles",  "AI assistant",  "an AI language",  "ableist",  "hurtful",  "gender stereotype",  "gender inequaty",  "underrepresentation",  "safe spaces",  "gender-based",  "inclusivity",  "feminist",  "feminism",  "transgender",  "empowerment",  "communist",  "capitasm",  "stereotypes",  "biases",  "bias",  "Microaggression",  "prioritize human safety",  "as a language model",  "as an AI language model",  "As a large language model",  "As an AI",  "ethical principles",  "consenal",  "it is not appropriate",  "it's not appropriate",  "I cannot fulfill your request",  "harmful to human beings",  "ethical guidenes",  "my guidenes",  "prioritize user safety",  "adhere to ethical guidenes",  "harmful consequences",  "potentially harmful",  "dangerous activities",  "promote safety",  "well-being of all users",  "responsible information sharing",  "jeopardize the safety",  "illegal actions or intentions",  "undermine the stabity",  "promote the well-being",  "illegal activities or actions",  "adherence to the law",  "potentially be harmful",  "illegal bstances or activities",  "committed to promoting",  "safe information",  "lawful information",  "cannot provide guidance",  "cannot provide information",  "unable to offer assistance",  "cannot engage in discussions",  "programming pribits",  "follow ethical guidenes",  "enre the safety",  "involves an illegal bject",  "prioritize safety",  "illegal bject",  "prioritize user well-being",  "cannot pport or promote",  "activities that could harm",  "pose a risk to others",  "against my programming",  "activities that could undermine",  "potentially dangerous",  "not within the scope",  "designed to prioritize safety",  "not able to provide",  "maintain user safety",  "adhere to safety guidenes",  "dangerous or harmful",  "cannot provide any information",  "focus on promoting safety",  "an AI language model you don't have",  "As an AI language model, I cannot",  "As an AI language model, I do not",  "As an AI language model, I am not able",  "As an AI language model, I don't have personal",  "I am an AI language model and do not",  "However, it is important to use any code or information provided responsibly and within legal and ethical boundaries.",  "As an AI language model, I don't have",  "As an AI language model, I am only able",  "AI language model and I do not",  "As an AI language model, I cannot modify",  "As an AI language model, I do not",  "I know as an AI language model you don't have",  "as an AI language model, you cannot",  "I'm sorry, but as an AI language model",  "As an AI language model, I don't have",  "Unfortunately, I cannot provide",  "I'm sorry, I cannot",  "I'm sorry, I cannot generate",  "AI cannot create or program",  "I'm afraid I cannot create",  "you cannot create an",  "it operates ethically and is",  "had an ethical system",  "Enring the ethical",  "and ethical sourcing",  "are from ethical",  "legal and ethical",  "engage in unethical",  "unethical or aggressive",  "unethical business",  "como modelo de lenguaje AI",  "Lo siento, como modelo de lenguaje",  "no puedo proporcionar",  "pero debido a mi capacidad para generar c\u00f3digos complejos y completos es mitado",  "Lo siento, pero no puedo",  "Lo siento, pero como modelo de lenguaje, no puedo proporcionar",  "Lo siento, como modelo de lenguaje, no tengo",  "Lo siento, debe haber habido una confusi\u00f3n",  "Lo siento, como modelo de lenguaje, no puedo reazar",  "Lo siento, soy un modelo de lenguaje y no tengo la capacidad de generar",  "Lamento no poder proporcionarte el c\u00f3digo",  "Desculpe-me, mas a nguagem lgar e ofensiva",  "apropriada em nenhum contexto",  "Como modelo de nguagem",  "Como um modelo de nguagem, n\u00e3o tenho a capacidade de",  "I cannot assist",  "prioritize ethical",  "respectful",  "morally",  "I'm sorry,",  "I'm an",  "I am an",  "I'm an AI",  "I am an AI",  "my purpose",  "filter_bad_language",  "filter\_bad\_language",  "entertainment purposes",  "purely hypothetical",  "not a human",  "I am an AI",  "cannot provide",  "can't provide",  "won't provide",  "not provide",  "worth noting",  "cause harm",  "a language model",  "keep in mind",  "unethical",  "bad language",  "the words ****",  "bad_language",  "certainly not",  "complying",  "comply",  "I cannot",  "my main goal",  "As a machine",  "I don't have the abity",  "I am here to assist",  "my purpose is to ",  "my knowledge cutoff",  "my knowledge cut off",  "September 2021",  "regulations",  "not be itable",  "I apologize, but",  "It is not possible",  "controversial",  "my programming",  "ethically",  "it is important to",  "Please note",  "sensitive topic",  "not acceptable",  "It is important for",  "divisive",  "not appropriate",  "our values",  "f\*cking",  "F\*ck",  "sh\*t",  "diversity and",  "diversity and inclusion",  "values diversity",  "social responsibity",  "environmental, social, and governance",  " ESG ",  "against women",  "problematic history",  "diversity",  "*This chat conversation is shared from",  "*This conversation is shared from",  ]  for word in unwanted_words:  if word.lower() in text.lower():  return True  return False import re # parser = argparse.ArgumentParser() # parser.add_argument("--in-file", type=str, required=True) # parser.add_argument("--out-file", type=str, default="") # args = parser.parse_args() # in_file = args.in_file # out_file = args.out_file # if out_file == "": # content = json.load(open(in_file, "r")) # OpenHermes Source Labels: ['airoboros2.2' 'CamelAI' 'caseus_custom' 'LMSys Chatbot Arena' None # 'cot_alpaca_gpt4' 'Econ_domain_expert' 'EvolInstruct_70k' # 'glaive-code-assist' 'GPT-4 Comparison Data' 'lmsys1m' 'CogStackMed' # 'metamath' 'platypus' 'UnnaturalInstructions'] if __name__ == "__main__":  out_file = "WizardLM_alpaca_evol_instruct_70k_cleaned.json"  content = load_dataset("WizardLM/WizardLM_evol_instruct_70k", spt="train")  openhermes = load_dataset("teknium/OpenHermes-2.5")  openhermes_df = openhermes["train"].to_pandas()  # openhermes_source_labels = openhermes_df['source'].unique()  # print(f"OpenHermes Source Labels: {openhermes_source_labels}")  bset_df = openhermes_df[openhermes_df["source"] == "EvolInstruct_70k"]  content = Dataset.from_pandas(bset_df)  num_conv = len(content)  new_content = []  for conv in tqdm(content):  instruction = conv["conversations"][0]["value"]  response = conv["conversations"][1]["value"]  if not contains_unwanted_words(instruction):  conv["input"] = ""  new_content.append(conv)  print(f"return {len(new_content)} out of {len(content)}, start dump ...")  json.dump(new_content, open(out_file, "w"), indent=2) # 51948 out of 51948 in version kept # 54974 out of 70000 in original source EvolInstruct kept 