{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 51,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Dynamic combination of English text.\n",
        "# Consecutive entries are greedily concatenated (space-separated) for as long as\n",
        "# the merged text stays within target_size whitespace-delimited words.\n",
        "import json\n",
        "target_size=147\n",
        "merged_paragraphs = []\n",
        "current_paragraph = \"\"\n",
        "filename='chunk_onlytwo/qasper_prob_qwen05B_merge.json'\n",
        "with open('chunk_onlytwo/qasper_prob_qwen05B.json', 'r', encoding='utf-8') as file:\n",
        "    paragraphs = json.load(file)\n",
        "for paragraph in paragraphs:\n",
        "    if not current_paragraph:\n",
        "        # Nothing accumulated yet: start a fresh chunk. This avoids a leading space\n",
        "        # on the merged text and avoids appending an empty string to the result\n",
        "        # when this paragraph alone exceeds target_size.\n",
        "        current_paragraph = paragraph\n",
        "    elif len(current_paragraph.split()) + len(paragraph.split()) <= target_size:\n",
        "        # Still within the word budget: extend the current chunk\n",
        "        current_paragraph += ' ' + paragraph\n",
        "    else:\n",
        "        # Budget exceeded: flush the current chunk and start a new one\n",
        "        merged_paragraphs.append(current_paragraph)\n",
        "        current_paragraph = paragraph\n",
        "# Flush whatever is left after the last paragraph\n",
        "if current_paragraph:\n",
        "    merged_paragraphs.append(current_paragraph)\n",
        "# json.dump escapes non-ASCII by default; the explicit encoding mirrors the read side\n",
        "with open(filename, 'w', encoding='utf-8') as file:\n",
        "    json.dump(merged_paragraphs, file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "(125.23849910048831, 974606)"
            ]
          },
          "execution_count": 1,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# English paragraph length: (mean words per entry, total words)\n",
        "import json\n",
        "with open('chunk_semantic/2wikimqa_82.json', 'r', encoding='utf-8') as file:\n",
        "    qa_data = json.load(file)\n",
        "# Whitespace-delimited word count of every entry\n",
        "word_counts = [len(sentence.split()) for sentence in qa_data]\n",
        "len_sents = sum(word_counts)\n",
        "len_lists = len(word_counts)\n",
        "len_sents/len_lists, len_sents"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 27,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Dynamic combination of Chinese text.\n",
        "# Consecutive entries are greedily concatenated (no separator) for as long as the\n",
        "# merged text stays within target_size characters.\n",
        "import json\n",
        "target_size=195\n",
        "merged_paragraphs = []\n",
        "current_paragraph = \"\"\n",
        "filename='chunk_onlytwo/multifieldqa_zh_prob_qwen7B_merge.json'\n",
        "with open('chunk_onlytwo/multifieldqa_zh_prob_qwen7B.json', 'r', encoding='utf-8') as file:\n",
        "    paragraphs = json.load(file)\n",
        "for paragraph in paragraphs:\n",
        "    if not current_paragraph:\n",
        "        # Nothing accumulated yet: start a fresh chunk, so an oversized paragraph\n",
        "        # does not cause an empty string to be appended to the result.\n",
        "        current_paragraph = paragraph\n",
        "    elif len(current_paragraph) + len(paragraph) <= target_size:\n",
        "        # Still within the character budget: extend the current chunk\n",
        "        current_paragraph += paragraph\n",
        "    else:\n",
        "        # Budget exceeded: flush the current chunk and start a new one\n",
        "        merged_paragraphs.append(current_paragraph)\n",
        "        current_paragraph = paragraph\n",
        "# Flush whatever is left after the last paragraph\n",
        "if current_paragraph:\n",
        "    merged_paragraphs.append(current_paragraph)\n",
        "# json.dump escapes non-ASCII by default; the explicit encoding mirrors the read side\n",
        "with open(filename, 'w', encoding='utf-8') as file:\n",
        "    json.dump(merged_paragraphs, file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 31,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "(82.73064162754304, 1321622)"
            ]
          },
          "execution_count": 31,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Chinese paragraph length: (mean length per entry, total length)\n",
        "import json\n",
        "with open('chunk_onlytwo/multifieldqa_zh_prob_qwen7B.json', 'r', encoding='utf-8') as file:\n",
        "    qa_data = json.load(file)\n",
        "# len() of every entry, with no word splitting\n",
        "entry_lengths = [len(sentence) for sentence in qa_data]\n",
        "len_sents = sum(entry_lengths)\n",
        "len_lists = len(entry_lengths)\n",
        "len_sents/len_lists, len_sents"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.14"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
