import json
import os

# Paths to the directories and files
train_dir = "train"
url_dir = ""
index_file_path = "laion-ocr-index-url-100000_seed42.txt"
output_file_path = "train/metadata.jsonl"


def load_index_lines(path):
    """Return the stripped, non-empty lines of the index file at *path*.

    Empty lines are dropped, so list positions count only non-empty lines.
    """
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(path, 'r', encoding="utf-8") as index_file:
        return [line.strip() for line in index_file if line.strip()]


def first_column_words(lines):
    """Join the first whitespace-separated token of each line with spaces.

    Blank or whitespace-only lines are skipped instead of raising
    ``IndexError``, so a stray empty line does not discard the whole sample.
    """
    return ' '.join(
        [fields[0] for fields in map(str.split, lines) if fields]
    )


def build_entry(filename, key_value, index_lines, ocr_root):
    """Build the metadata dict for one sidecar JSON file.

    Args:
        filename: Name of the sidecar file (ends with ``.json``).
        key_value: Value of the sidecar's ``"key"`` attribute.
        index_lines: Non-empty index lines, each ``"<dir>_<subdir> <rest>"``.
        ocr_root: Directory under which ``<dir>/<subdir>/ocr.txt`` is read.

    Returns:
        A dict with the keys ``"file_name"``, ``"text"`` and ``"words"``.

    Raises:
        ValueError: The key is not an integer or the index line is malformed.
        TypeError: The key has a type ``int()`` cannot convert.
        IndexError: The key is negative or past the end of the index.
        OSError: The ``ocr.txt`` file cannot be opened.
    """
    # NOTE(review): the key is used directly as a zero-based position in the
    # non-empty index lines; no offset is applied - confirm against the
    # scheme that generated the sidecar keys.
    line_index = int(key_value)
    if line_index < 0:
        # A negative index would silently wrap around to the end of the list.
        raise IndexError(f"negative index key {key_value!r}")
    index_line = index_lines[line_index]

    # First space-separated field is "<dir>_<subdir>"; the rest is unused.
    dir_subdir, _ = index_line.split(' ', 1)
    dir_name, subdir = dir_subdir.split('_', 1)

    ocr_path = os.path.join(ocr_root, dir_name, subdir, "ocr.txt")
    with open(ocr_path, 'r', encoding="utf-8") as ocr_file:
        words = first_column_words(ocr_file)

    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier ".json" occurrence inside the name.
    image_name = os.path.splitext(filename)[0] + ".jpg"
    return {
        "file_name": image_name,
        "text": f'An image with text saying "{words}"',
        "words": words,
    }


def build_metadata(images_dir=train_dir, ocr_root=url_dir,
                   index_path=index_file_path, output_path=output_file_path):
    """Write one JSON line per usable ``*.json`` sidecar in *images_dir*.

    Sidecars without a ``"key"`` attribute are skipped silently. Sidecars that
    cannot be parsed or resolved are reported on stdout and skipped, so one
    bad file never aborts the run.

    Args:
        images_dir: Directory scanned for ``*.json`` sidecar files.
        ocr_root: Root directory of the ``<dir>/<subdir>/ocr.txt`` files.
        index_path: Path of the index file.
        output_path: Path of the JSON-lines file to create (overwritten).

    Returns:
        The number of metadata entries written.
    """
    index_lines = load_index_lines(index_path)
    written = 0

    with open(output_path, 'w', encoding="utf-8") as output_file:
        # os.listdir order is arbitrary; sort for reproducible output.
        for filename in sorted(os.listdir(images_dir)):
            if not filename.endswith(".json"):
                continue
            json_path = os.path.join(images_dir, filename)

            try:
                with open(json_path, 'r', encoding="utf-8") as json_file:
                    data = json.load(json_file)
                if not isinstance(data, dict):
                    raise ValueError("top-level JSON value is not an object")

                key_value = data.get("key")
                if key_value is None:
                    continue

                entry = build_entry(filename, key_value, index_lines, ocr_root)
            except (ValueError, TypeError, IndexError, OSError) as e:
                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
                print(f"Error processing file {filename}: {e}")
                continue

            # Write the metadata entry as a JSON line
            output_file.write(json.dumps(entry) + "\n")
            written += 1

    return written


if __name__ == "__main__":
    build_metadata()
    print(f"Metadata file has been created at {output_file_path}")
