"""
Code for generating dataset to make gpt4 generate diffs for patches with improved diff format
"""

from datasets import load_dataset, Dataset, DatasetDict, load_from_disk


# Load the dataset
# dataset = load_dataset("princeton-nlp/SWE-bench_Lite_oracle")
dataset = load_dataset("princeton-nlp/SWE-bench_Lite_bm25_27K")

new_diff_prompt = """
Please resolve the issue of the user (at the top, within <issue/> brackets) and implement a solution.
Present the solution as a diff (custom format, explained below).

The general format of a diff is as follows.
```custom-diff
diff
<path/filename>
< "rewrite" or "insert" >
< rough line number / EOF / BOF >
< insert function that should be added or rewritten >
end diff
< repeat blocks of diff as necessary >
```
Insertion can only be done at the end or beginning of the file, indicated by EOF or BOF respectively.

As an example for a diff, consider the following two versions of the same file, once before and once after a change.
The original version of the file was as follows.
[start of demo/test_file.py]
1 def test_euclidean(a, b):
2     assert euclidean(0, 0) == 0
3     assert euclidean(0, 1) == 1
4     assert euclidean(1, 0) == 1
5     assert euclidean(1, 1) == 1
6
7 @pytest.mark.parametrize("a, b, expected", [(0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 1)])
8 def test_gcd(a, b):
9     assert gcd(a, b) == expected
10
[end of demo/file.py]

The diff for fix in function euclidean and adds the function gcd is as follows.
This diff changes the first file into the second file.
```custom-diff
diff
demo/file.py
rewrite
1
def test_euclidean(a, b):
    assert euclidean(0, 0) == 0
    assert euclidean(0, 1) == 1
    assert euclidean(1, 0) == 1
    assert euclidean(1, 1) == 1
    assert euclidean(100, 10) == 10
end diff
diff
demo/file.py
insert
EOF
@ pytest.mark.parametrize("a, b, expected", [(0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 1), (100, 10, 10)])
def test_lcm(a, b):
    assert lcm(a, b) == expected
end diff
```

The new version of the file is as follows.
[start of demo/file.py]
1 def test_euclidean(a, b):
2     assert euclidean(0, 0) == 0
3     assert euclidean(0, 1) == 1
4     assert euclidean(1, 0) == 1
5     assert euclidean(1, 1) == 1
6     assert euclidean(100, 10) == 10
7
8 @pytest.mark.parametrize("a, b, expected", [(0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 1)])
9 def test_gcd(a, b):
10     assert gcd(a, b) == expected
11
12 @pytest.mark.parametrize("a, b, expected", [(0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 1), (100, 10, 10)])
13 def test_lcm(a, b):
14     assert lcm(a, b) == expected
15
[end of demo/file.py]

As you can see, you need to indicate the approximate line numbers, function name and the path and file name you want to change,
but there can be as many independent blocks of changes as you need. You may also apply changes to several files.
Apply as much reasoning as you please and see necessary. The format of the solution is fixed and has to follow the custom diff format.
"""

splits = {}
for split in dataset:
    count = 0
    num_new_funs = 0
    total_count = 0
    new_examples = []
    for i, example in enumerate(dataset[split]):
        files = []
        orig_text = example["text"].splitlines()
        new_text = orig_text
        new_text[0] = "The following text contains a user issue (in <issue/> brackets) posted at a repository. Further, you are provided with file contents of several files in the repository that contain relevant code (in <code> brackets). It may be necessary to use code from third party dependencies or files not contained in the attached documents however. Your task is to identify the issue and implement a solution by rewriting the correct functions. More details at the end of this text."
        line_of_diff_prompt = [i for i, l in enumerate(new_text) if l.startswith("</code>")][-1]+1
        new_text = new_text[:line_of_diff_prompt]
        new_text = "\n".join(new_text)
        new_text += new_diff_prompt
        new_example = {
            **example,
            "text": new_text,
        }
        new_examples.append(new_example)
    splits[split] = Dataset.from_list(new_examples)
ds = DatasetDict(splits)
ds.save_to_disk("./datasets/swe_bench_lite_aug1_bm25_cl27k")

