import os

import fire
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from utils.seed import seed_everything


def csv_to_parquet(
    csv_path,
    chunksize=1_000_000,
    delimiter="\t",
    low_memory=False,
    save_name="GBIF",
    seed=42,
):
    """Stream a delimited text file into one snappy-compressed Parquet file.

    The input is read with ``pandas.read_csv`` in chunks of ``chunksize``
    rows, keeping only the columns listed in ``usecols`` below, and every
    chunk is appended to ``data/occurance/<save_name>.parquet``. The output
    directory is created if it does not exist yet.

    Args:
        csv_path: Input file; forwarded as-is to ``pandas.read_csv``.
        chunksize: Number of rows read and written per chunk.
        delimiter: Field separator of the input file (tab by default).
        low_memory: Forwarded to ``pandas.read_csv``.
        save_name: Base name of the output file, without directory or
            extension.
        seed: Value passed to ``seed_everything`` before any work is done.

    Note:
        The Parquet schema is inferred from the first chunk and then imposed
        on every later chunk. A later chunk whose pandas dtypes do not fit
        that schema may fail to convert; pass explicit dtypes upstream if
        that happens. If the reader yields no chunks at all, no output file
        is written.
    """
    seed_everything(seed)
    csv_stream = pd.read_csv(
        csv_path,
        sep=delimiter,
        chunksize=chunksize,
        low_memory=low_memory,
        usecols=[
            "scientificName",
            "decimalLatitude",
            "decimalLongitude",
            "year",
            "taxonKey",
            "order",
            "family",
            "genus",
            "species",
            "issue",
        ],
    )
    save_name = "data/occurance/" + save_name + ".parquet"
    # Create the target directory up front so a missing folder does not abort
    # the run only after the first (potentially huge) chunk has been parsed.
    os.makedirs(os.path.dirname(save_name), exist_ok=True)

    # The writer is opened lazily because its schema comes from the first
    # chunk; ``None`` marks "not opened yet" so cleanup is always safe.
    parquet_writer = None
    parquet_schema = None
    try:
        for i, chunk in enumerate(csv_stream):
            print("Chunk", i)
            if parquet_writer is None:
                parquet_schema = pa.Table.from_pandas(df=chunk).schema
                parquet_writer = pq.ParquetWriter(
                    save_name, parquet_schema, compression="snappy"
                )
            table = pa.Table.from_pandas(chunk, schema=parquet_schema)
            parquet_writer.write_table(table)
    finally:
        # Always finalize the Parquet file and release the CSV handle, even
        # when a chunk fails to parse or convert.
        try:
            if parquet_writer is not None:
                parquet_writer.close()
        finally:
            csv_stream.close()


# Script entry point: hand csv_to_parquet to python-fire so that its
# parameters can be supplied on the command line
# (e.g. ``--csv_path=... --save_name=...``).
if __name__ == "__main__":
    fire.Fire(csv_to_parquet)
