{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-overview",
   "metadata": {},
   "source": [
    "# tgbl-comment edge list to `reddit_v1.csv`\n",
    "\n",
    "Reads the leading rows of the tgbl-comment edge list, keeps the first row of every `(src, dst)` pair, min-max scales the numeric features, adds a constant `label` column and writes the result with the columns `u, i, ts, label, f0, f1, f2`.\n",
    "\n",
    "**Column handling.** Columns are always addressed by *name*. An earlier version relabelled the frame by position (`df.columns = [...]`) after appending `label` as the last column, which attached the target names to the wrong data. The recorded output of that earlier run showed 721699 distinct values among 731686 non-zero entries in the first file column versus roughly 70k in the second and third, so the first column of the file is presumably the timestamp rather than `src` (TODO: confirm against the CSV header)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a656dd52-30ce-4f59-8ebc-ef8ced7fc641",
   "metadata": {
    "id": "a656dd52-30ce-4f59-8ebc-ef8ced7fc641"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-constants",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "EDGELIST_CSV = 'newdata/tgbl-comment/tgbl-comment_edgelist.csv'\n",
    "OUTPUT_CSV = 'reddit_v1.csv'\n",
    "\n",
    "CHUNK_ROWS = 10**4  # rows per read_csv chunk; adjust to the available memory and dataset\n",
    "MAX_ROWS = 1000000  # only this many leading data rows of the edge list are read\n",
    "\n",
    "EDGE_KEY = ['src', 'dst']  # one row is kept per (src, dst) pair\n",
    "SCALED_COLUMNS = ['num_words', 'score']  # min-max scaled before export\n",
    "\n",
    "# Export layout, in output order: column name in the edge list -> name in the exported file.\n",
    "# 'label' is not read from the file; it is added below as a constant 0.\n",
    "EXPORT_COLUMNS = {\n",
    "    'src': 'u',\n",
    "    'dst': 'i',\n",
    "    'ts': 'ts',\n",
    "    'label': 'label',\n",
    "    'subreddit': 'f0',\n",
    "    'num_words': 'f1',\n",
    "    'score': 'f2',\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load and de-duplicate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b282631-a16e-4daf-a02d-cbedf215cd8f",
   "metadata": {
    "id": "2b282631-a16e-4daf-a02d-cbedf215cd8f"
   },
   "outputs": [],
   "source": [
    "reader = pd.read_csv(EDGELIST_CSV, chunksize=CHUNK_ROWS, nrows=MAX_ROWS)\n",
    "\n",
    "# De-duplicating each chunk first keeps the list of chunks small. A pair that\n",
    "# repeats across two chunks survives that step, so the concatenated frame is\n",
    "# de-duplicated once more.\n",
    "deduped_chunks = [chunk.drop_duplicates(subset=EDGE_KEY, keep='first') for chunk in reader]\n",
    "edges_raw = pd.concat(deduped_chunks).drop_duplicates(subset=EDGE_KEY, keep='first')\n",
    "\n",
    "# Fail loudly here instead of exporting mislabelled columns further down.\n",
    "missing = [name for name in EXPORT_COLUMNS if name != 'label' and name not in edges_raw.columns]\n",
    "if missing:\n",
    "    raise KeyError(f'edge list is missing expected column(s): {missing}')\n",
    "\n",
    "edges_raw.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-inspect",
   "metadata": {},
   "source": [
    "## Inspect the first three columns\n",
    "\n",
    "Purely informational: distinct and non-zero counts of the first three file columns, by position."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0c0c5e5-07b0-4fb2-a888-84c778736cff",
   "metadata": {
    "id": "f0c0c5e5-07b0-4fb2-a888-84c778736cff"
   },
   "outputs": [],
   "source": [
    "ordinals = ['first', 'second', 'third']\n",
    "\n",
    "for position, ordinal in enumerate(ordinals):\n",
    "    unique_count = edges_raw.iloc[:, position].nunique()\n",
    "    print(f'Number of unique values in the {ordinal} column: {unique_count}')\n",
    "\n",
    "for position, ordinal in enumerate(ordinals):\n",
    "    nonzero_count = np.count_nonzero(edges_raw.iloc[:, position])\n",
    "    print(f'Number of non-zero values in the {ordinal} column: {nonzero_count}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-build",
   "metadata": {},
   "source": [
    "## Build the export table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78a100b4-db23-47b1-a7d2-e52a4e27ced4",
   "metadata": {
    "id": "78a100b4-db23-47b1-a7d2-e52a4e27ced4"
   },
   "outputs": [],
   "source": [
    "def min_max_scale(values):\n",
    "    \"\"\"Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    values : pd.Series\n",
    "        Numeric column to rescale.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.Series\n",
    "        New Series with the same index. A column whose minimum equals its\n",
    "        maximum has no range to divide by and is returned as all zeros\n",
    "        rather than as the NaN produced by 0 / 0.\n",
    "    \"\"\"\n",
    "    low, high = values.min(), values.max()\n",
    "    if high == low:\n",
    "        return pd.Series(0.0, index=values.index, name=values.name)\n",
    "    return (values - low) / (high - low)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5150583e-19d0-4ed6-9de5-b1a9b6a63fd4",
   "metadata": {
    "id": "5150583e-19d0-4ed6-9de5-b1a9b6a63fd4"
   },
   "outputs": [],
   "source": [
    "scaled = {name: min_max_scale(edges_raw[name]) for name in SCALED_COLUMNS}\n",
    "\n",
    "interactions = (\n",
    "    edges_raw\n",
    "    .assign(\n",
    "        **scaled,\n",
    "        subreddit=edges_raw['subreddit'].astype(str),\n",
    "        label=np.zeros(len(edges_raw), dtype=int),  # every row gets the label 0\n",
    "    )\n",
    "    # Reorder by NAME first, then rename through a mapping. Assigning a list to\n",
    "    # .columns only relabels positions and does not move any data, so it puts\n",
    "    # the target names on the wrong columns whenever the file order differs\n",
    "    # from the export order (it always does for 'label', which is appended last).\n",
    "    .loc[:, list(EXPORT_COLUMNS)]\n",
    "    .rename(columns=EXPORT_COLUMNS)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d16f3f3f-be2a-4cc3-962f-3d022652bf90",
   "metadata": {
    "id": "d16f3f3f-be2a-4cc3-962f-3d022652bf90"
   },
   "outputs": [],
   "source": [
    "assert list(interactions.columns) == ['u', 'i', 'ts', 'label', 'f0', 'f1', 'f2']\n",
    "interactions.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-encode",
   "metadata": {},
   "source": [
    "## Encode node ids\n",
    "\n",
    "`u` and `i` are each replaced by integer category codes. The two columns are encoded independently of each other."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14de3bab-9533-44b2-b95c-87ec266246df",
   "metadata": {
    "id": "14de3bab-9533-44b2-b95c-87ec266246df"
   },
   "outputs": [],
   "source": [
    "interactions_encoded = interactions.assign(\n",
    "    u=interactions['u'].astype('category').cat.codes,\n",
    "    i=interactions['i'].astype('category').cat.codes,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32f1220c-d094-4d43-b8cf-356456c0de7d",
   "metadata": {
    "id": "32f1220c-d094-4d43-b8cf-356456c0de7d"
   },
   "outputs": [],
   "source": [
    "# Codes must form the contiguous range 0..n-1. A missing id would show up as\n",
    "# the code -1 and trip the first assertion.\n",
    "for column in ['u', 'i']:\n",
    "    codes = interactions_encoded[column]\n",
    "    assert codes.min() == 0, f'{column}: codes do not start at 0'\n",
    "    assert codes.max() + 1 == codes.nunique(), f'{column}: codes are not contiguous'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-save",
   "metadata": {},
   "source": [
    "## Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "566f71c5-afc3-4525-a387-2742f2b04ba6",
   "metadata": {
    "id": "566f71c5-afc3-4525-a387-2742f2b04ba6",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# The row index is written as the leading unnamed column (to_csv default), as before.\n",
    "interactions_encoded.to_csv(OUTPUT_CSV)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}