import os
import json
import argparse
from tqdm import tqdm

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager guarantees the handle is closed even if a line
    # fails to parse; the previous bare open() leaked it.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # record, so skip them instead of failing to decode them.
        return [json.loads(line) for line in f if line.strip()]

def save_json(data, file_path):
    """Serialize ``data`` as a single JSON document to ``file_path``.

    Any existing file at ``file_path`` is overwritten.

    Args:
        data: A JSON-serializable object.
        file_path: Destination path of the output file.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` contains a value that ``json`` cannot encode.
    """
    # The ``with`` block closes the file on exit, so no explicit close()
    # call is needed afterwards.
    with open(file_path, 'w') as f:
        json.dump(data, f)

def _extract_repo_name(url):
    """Return ``owner/repo`` parsed from a GitHub URL, or ``None``.

    Args:
        url: A string containing ``https://github.com``.

    Returns:
        The first two non-empty path segments after the host joined with
        ``/``, or ``None`` when fewer than two such segments exist.
    """
    # Everything after the first occurrence of the host marker.
    path = url.partition("https://github.com")[2]
    segments = [part.strip() for part in path.split('/') if part.strip()]
    if len(segments) < 2:
        return None
    return f"{segments[0]}/{segments[1]}"


def main(args):
    """Collect unique GitHub ``owner/repo`` names and save them as JSON.

    Reads the JSONL file at ``args.pypi_list_path``, takes the ``github``
    value of every record that has a truthy one, keeps those containing
    ``https://github.com/``, reduces each to ``owner/repo``, then writes the
    sorted, de-duplicated names to ``args.save_file_name``.

    Args:
        args: Parsed command-line namespace providing ``pypi_list_path`` and
            ``save_file_name``.
    """
    data = load_jsonl(args.pypi_list_path)
    # .get() tolerates records without a "github" key instead of raising
    # KeyError; they are treated like records with an empty value.
    valid_repo = [item['github'] for item in data if item.get('github')]

    # extract git repo from urls
    repo_names = set()
    for url in tqdm(valid_repo):
        if "https://github.com/" not in url:
            continue
        repo_name = _extract_repo_name(url)
        if repo_name is None:
            # Report the original URL; the loop variable is never
            # overwritten, so the message shows what was actually skipped.
            print(f"Error while parsing git-repo name from {url}: skipping {url}")
            continue
        repo_names.add(repo_name)

    repo_names = sorted(repo_names)
    print(f"Total number of valid repos: {len(repo_names)}")
    save_json(repo_names, args.save_file_name)
    
if __name__ == '__main__':
    # Command-line interface: one required input path, one optional output
    # file name.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--pypi_list_path',
        type=str,
        required=True,
        help="Path to the file of a list of PyPI packages",
    )
    parser.add_argument(
        '--save_file_name',
        type=str,
        default="valid_top_pypi_gitrepos.jsonl",
        help="Name of file to save a list of valid repos",
    )

    args = parser.parse_args()
    main(args)