% if your latex compiler failed to compile, uncomment the command below:
% \RequirePackage[2020-02-02]{latexrelease}
\documentclass{clv3}

\usepackage{hyperref}
\usepackage{xcolor}
\usepackage{linguex}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{booktabs}
\usepackage{adjustbox}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{graphicx}
\usepackage{pgfplots}
\usepackage{subcaption}
\usepackage{cleveref} 
\usepackage{float}
% \usepackage[utf8]{inputenc}
\usepackage{caption}
\usepackage{pgfplots}
\usepackage{stfloats}
\definecolor{darkblue}{rgb}{0, 0, 0.5}
\hypersetup{colorlinks=true,citecolor=darkblue, linkcolor=darkblue, urlcolor=darkblue}

\bibliographystyle{compling}

% test compatibility with algorithmic.sty
%\usepackage{algorithmic}
\captionsetup[figure]{labelfont=bf, textfont=normal}
% Make the label of every table caption (e.g., "Table" and its number) bold
\captionsetup[table]{labelfont=bf, textfont=normal}
% Adjust these values to obtain different shades of green.
\definecolor{mygreen}{RGB}{46,139,87} % An example of sea green


\begin{document}

%Document Head
\dochead{}

\runningtitle{Understanding Before Reasoning}

\runningauthor{Dong-Hai Zhu et al.}

\pageonefooter{}

\title{Understanding Before Reasoning: Enhancing Chain-of-Thought with Iterative Summarization Pre-Prompting}

\author{Dong-Hai Zhu}
\affil{School of Electronic and Electrical Engineering, Shanghai University of Engineering Science, Shanghai, China\\
Email: m325123217@sues.edu.cn}

\author{Yu-Jie Xiong\thanks{Corresponding author.}}
\affil{School of Electronic and Electrical Engineering, Shanghai University of Engineering Science, Shanghai, China\\
Email: xiong@sues.edu.cn}

\author{Jia-Chen Zhang}
\affil{School of Electronic and Electrical Engineering, Shanghai University of Engineering Science, Shanghai, China\\
Email: m325123603@sues.edu.cn}

\author{Xi-Jiong Xie}
\affil{School of Information Science and Engineering, Ningbo University, Ningbo, China\\
Email: xiexijiong@nbu.edu.cn}

\author{Chun-Ming Xia}
\affil{School of Electronic and Electrical Engineering, Shanghai University of Engineering Science, Shanghai, China\\
Email: cmxia@sues.edu.cn}

\maketitle

\begin{abstract}
Chain-of-Thought (CoT) Prompting is the dominant paradigm applied in Large Language Models (LLMs) to enhance their capacity for complex reasoning.
It guides LLMs to demonstrate the problem-solving process through a chain of reasoning steps, rather than requiring LLMs to generate the final answer directly.
Despite its success, CoT encounters difficulties when key information required for the reasoning process is either implicit or missing.
This limitation primarily stems from the fact that CoT emphasizes the stages of reasoning while neglecting the critical task of gathering and extracting essential core information in the early stage.
In this paper, we propose a pre-prompting methodology called Iterative Summarization Pre-Prompting ($\text{ISP}^{2}$), which can effectively refine the reasoning ability of LLMs when key information is not explicitly presented. 
First, entities and their corresponding descriptions are extracted to form potential key information pairs from the question.
Next, we introduce the reliability rating to assess the reliability of these information pairs. 
Then, two information pairs with the lowest rankings through the reliability rating are merged into a new potential information description, which includes a new entity and its corresponding description.
This process is applied iteratively to guide the generation of a unique information pair.
Finally, the obtained key information pair, along with the original question, is fed into LLMs for reasoning, resulting in the final answer.
Extensive experiments are conducted to validate the effectiveness of the proposed method. 
The results show that, compared to existing methods, our approach yields a 7.1\% improvement in performance.
In summary, unlike traditional prompting methods, $\text{ISP}^{2}$ adopts an inductive approach with pre-prompting.
It demonstrates good plug-and-play performance and can theoretically be applied to improve performance across all reasoning frameworks.
The code is available at: \url{https://github.com/zdhgreat/ISP-2}.
\end{abstract}

\section{Introduction}

Large Language Models (LLMs) have made significant strides in Natural Language Processing (NLP) tasks, such as question answering, automatic summarization, and machine translation~\citep{openai2023gpt4, chowdhery2022palm, touvron2023llama, touvron2023llama2, huang2022large, zhao2023survey}.
However, despite these advancements, LLMs still fall short in reasoning performance compared to humans. 
Increasing model parameters cannot close the gap in reasoning capabilities between LLMs and human intelligence.
Chain-of-Thought (CoT) Prompting is a crucial technique for enhancing the reasoning capabilities of LLMs.
It guides LLMs to break down complex reasoning into simpler steps, a process similar to human reasoning.
CoT uses prompts such as ``let's think step by step''~\cite{Kojima2022LargeLM} and multiple learnable examples~\cite{wei2022chain} to generate interpretable prediction paths.
It has made LLMs excellent zero-shot and few-shot reasoners, paving the way for tackling complex problems.
However, when faced with more complex problems, CoT prompting methods often fail to address them effectively.
During the reasoning process, some information is often overlooked, which results in unclear guiding strategies and ultimately impacts the quality of the answer.


\citet{Simon1978InformationProcessingTO}'s theory provides valuable insights to better understand and solve problems.
First, individuals gather key information from the context of the problem and use it to construct the problem space. 
Then, with an understanding of the current information, they use heuristic strategies to summarize and refine their thoughts, gradually approaching the solution to the problem.
Building upon Simon's foundational work in information processing theory, this paper applies these concepts to the reasoning of LLMs and introduces Iterative Summarization Pre-Prompting ($\text{ISP}^{2}$).
It leverages the inherent knowledge of LLMs to develop precise descriptions of specific problem spaces. 
Specifically, $\text{ISP}^{2}$ is a pre-prompting method applied before CoT, allowing the LLMs to summarize more comprehensive knowledge to assist in reasoning. 
In Figure~\ref{fig8}, we illustrate the workflow differences between standard prompting and the pre-prompting method.
$\text{ISP}^{2}$ coordinates three key LLM steps: adaptive extraction of candidate information, reliability rating of information pairs, and iterative summarization for knowledge understanding.
These steps include summarizing and integrating relevant information, as well as formulating strategies before tackling intricate real-world reasoning tasks. 
By engaging in these pre-prompting steps, LLMs can better explore and understand the nuances of complex problems, thereby improving their ability to perform sophisticated reasoning.
Testing with GPT-3.5 Turbo shows that inserting $\text{ISP}^{2}$ before CoT and Complex CoT leads to a significant performance improvement, with increases of 7.1\% and 8.1\%, respectively.
The average performance score of $\text{ISP}^{2}$ with CoT reaches 79.43, surpassing other SOTA methods with plug-and-play capabilities.
In these processes and results, our main contributions are as follows:

\begin{figure}
\centering
\includegraphics[width=\textwidth]{preprompt.drawio.pdf}
\caption{The difference between prompting and pre-prompting. Conventional prompting focuses on providing reinforcement during the inference process to guide the model's reasoning. In contrast, pre-prompting enriches the input context prior to reasoning, ensuring that the model has a more comprehensive and refined understanding from the outset.}
\label{fig8}
\end{figure}

\begin{itemize}
\item We introduce a pre-prompting method that can be seamlessly integrated into various CoT methods to enhance their reasoning performance.
\item We extend information extraction to iterative generation and reliability rating, creating a new process for step-by-step information integration in complex scenarios.
\item $\text{ISP}^{2}$ demonstrates robustness, performing exceptionally well across diverse models, including GPT-3.5 Turbo and open-source LLaMA2, proving its effectiveness in various reasoning environments. Furthermore, it achieves top performance among SOTA plug-and-play methods.
\end{itemize}





\section{Related Work}


\subsection{Chain-of-Thought Prompting}
\citet{wei2022chain} emphasize the importance of deriving conclusive answers through multi-step logical pathways by introducing the concept of Chain-of-Thought (CoT) reasoning.
The method demonstrates that reasoning abilities can be elicited through a series of thoughtful steps.
\citet{Kojima2022LargeLM} discover that simply adding the phrase ``let's think step by step'' in prompts allows LLMs to perform zero-shot logical reasoning without any additional human prompts.
Subsequently, \citet{wang2023selfconsistency} introduce Self Consistency (SC) to replace the greedy decoding strategy. 
\citet{zhang2023automatic} construct an automatic CoT framework based on the problem, eliminating the instability of manual prompts. 
\citet{fu2023complexitybased} employ complexity-based multi-step reasoning estimation to execute CoT.
\citet{NEURIPS2023_271db992} propose Tree-of-Thoughts (ToT), which introduces deliberation into decision-making by considering multiple reasoning paths.
\citet{xu2024rereading} enhance the model's understanding by re-reading the question. 
These studies underscore the importance of CoT in enhancing the reasoning and planning capabilities of LLMs in complex scenarios.
Despite this progress, CoT still requires further refinement in scenarios involving more complex problems.


\subsection{In-Context Learning}
In-context learning (ICL) enables LLMs to make predictions based on input examples without updating model parameters.
\citet{10.5555/3495724.3495883} introduce this concept in GPT-3, demonstrating that LLMs can generalize tasks from a small number of examples embedded in the input context. 
\citet{min-etal-2022-metaicl} propose Meta-training for In-Context Learning (MetaICL), which significantly enhances ICL capabilities through continuous training on various tasks using demonstrations.
Additionally, the concept of supervised context training~\cite{chen-etal-2022-improving} is proposed to bridge the gap between pre-training and downstream ICL tasks.
An LLM refines its prior knowledge through ICL, thereby improving performance across multiple tasks~\cite{Krishnamurthy2024CanLL}.
ICL allows a single model to perform various tasks universally, helping it better align its predictions with the semantic requirements of the prompts.

\subsection{Task Decomposition}
\citet{perez-etal-2020-unsupervised} decompose complex problems into several independent subproblems with LLMs, and then aggregate the answers to form the final response.
\citet{Wang2022IterativelyPP} address problems by modeling prompts as continuous virtual tokens and iteratively eliciting relevant knowledge from an LLM.
\citet{yang-etal-2022-seqzero} decompose normal questions into a series of subproblems, which are then converted into SQL queries using a rule-based system.
\citet{10.1145/3491102.3517582} introduce the idea of linking LLM steps, where the output of one step becomes the input of the next, and develop an interactive system for users to build and modify these chains.
\citet{zhou2023leasttomost} argue that generated subproblems are often interdependent and need to be solved in a specific order, with the answers to some subproblems serving as the foundation for others.
They propose the Least-to-Most Prompting method, which links the problem decomposition process to the solving of subproblems.
\citet{zhang2024cumulative} propose Cumulative Reasoning (CR), which breaks down complex tasks into smaller, manageable steps and utilizes iterative collaboration among three different LLMs to incrementally solve problems.


\subsection{Self Evaluation}
Researchers have proposed automated evaluation methods, such as Sentence-BERT \cite{reimers-gurevych-2019-sentence} and SimCSE \cite{gao-etal-2021-simcse}, to assess the reasoning process. 
However, these methods primarily concentrate on matching individual words and phrases, which limits their ability to fully assess the logical consistency and deeper meaning of the context.
To address these limitations, the feasibility of using LLMs to evaluate their own predictions is becoming an increasingly important step in problem-solving. 
\citet{Shinn2023ReflexionLA}, \citet{10.5555/3666122.3668141} and \citet{paul-etal-2024-refiner} introduce the Self Evaluation (SE) mechanism, where LLMs provide feedback on the candidate answers they generate. 
\citet{chen2024teaching} improve LLM code generation accuracy by using self-generated feedback.
Similarly, \citet{10.5555/3666122.3667845} introduce a review step to evaluate actions and states in operational tasks and decide the next steps.
In terms of reasoning, \citet{NEURIPS2023_271db992} emphasize SE guided decoding, where the LLM uses carefully designed prompts to evaluate candidate answers via a tree search procedure.
\citet{10.1145/3657604.3662042} explore how to facilitate scalable self-reflection with LLMs, demonstrating its effectiveness in improving student learning outcomes.
By incorporating fair assessment in LLM learning, our approach injects the reflection mechanism into problem space understanding rather than just evaluating candidate answers, allowing for deeper consideration of the problem and focusing more on the essence of the problem.
We believe that reasoning based on a thorough understanding of information leads to further refinement and improvement.


\section{Proposed Approach}


\begin{figure*}
\centering
\includegraphics[width=\textwidth]{ISP.drawio.pdf} % Reduce the figure size so that it is slightly narrower than the column. Don't use precise values for figure width.This setup will avoid overfull boxes.
\caption{An illustrative example of the Iterative Summarization Pre-Prompting ($\text{ISP}^{2}$) workflow. $\text{ISP}^{2}$ starts by obtaining an initial set of explicit information pairs based on the current question. It then iteratively refines the description of the problem space through a reliability scoring mechanism and iterative summarization, ultimately arriving at the final answer with the help of fundamental prompts.}
\label{fig1}
\end{figure*}

The key to problem-solving lies in thoroughly understanding the problem and integrating important information based on the context. 
The process of understanding involves not only identifying the key parts that contain the answers but also clearly representing the problem space. 
The deep understanding helps reduce cognitive load and accelerates the process of finding answers. 
In this work, we enhance reasoning by constructing knowledge representations of the problem space, grounded in a thorough understanding of the question at hand.
As illustrated in Figure \ref{fig1}, we propose Iterative Summarization Pre-Prompting ($\text{ISP}^{2}$), a plug-and-play pre-prompting method that operates in three steps: adaptive extraction of candidate information, reliability rating of information pairs, and iterative summarization for knowledge understanding.
Throughout the progressive process, the LLM continuously enhances its understanding of the information, thereby more effectively exploring and solving problems.

\subsection{Adaptive Extraction of Candidate Information}

LLMs demonstrate distinct advantages over traditional machine learning methods in extracting and synthesizing complex information, thanks to their sophisticated architectures and extensive training on broad datasets \cite{10.1007/978-3-031-36004-6_69}. 
Given the strong information extraction capabilities, we leverage the LLM to extract multiple entities $ E $ under the question $ Q $.
To effectively understand a problem, it is crucial to gather all relevant information related to the question as candidates. 
The comprehensive collection of potential information forms the foundation for subsequent deepening of understanding. 
During the extraction process, not all entities maintain a high degree of relevance to $ Q $. 
Therefore, we design prompts and provide sufficient examples to guide the LLM. 
It allows the LLM to learn from the examples in the prompt $ T $ and effectively retain the $ n $ entities $ E = \{(e_i)\}_{i=1}^n $ that are highly relevant to the question.
For each entity, the LLM summarizes the knowledge descriptions $ K = \{k_{i1}, k_{i2}, \ldots, k_{it}\}_{i=1}^n $ that are related to it. 
The entities and their corresponding knowledge descriptions are organized into information pairs $ IP = \left[e_i,\{k_{i1}, k_{i2}, \ldots, k_{it}\}\right]_{i=1}^n $. 
Each information pair focuses on summarizing explicit knowledge in the question, laying the groundwork for deeper analysis.
Meanwhile, we do not specify the number of entities or knowledge descriptions due to the varying amount of extracted information for each question. 
This flexible approach ensures that the LLM can adapt to the specific requirements of different problems, continuously enhancing its understanding and refining its outputs.


\subsection{Reliability Rating of Information Pairs}


Inspired by the recent success of Self Evaluation \cite{Kadavath2022LanguageM}, we introduce an automated evaluation method called reliability rating.
A key aspect of understanding lies in uncovering potentially lost information, and reliability rating plays a crucial role in this process.
Reliability rating not only assesses the potential of each information pair to contribute effectively to problem-solving but also evaluates the completeness of the candidate information. 
It evaluates the current information pair $ IP_t $ by considering all previously generated information pairs $ IP_{1:n} $. 
We define the value of each information pair using the function $V(IP_t)$:
\begin{equation}
V(IP_t) \sim p(v \mid T, Q, IP_{1:n}),
\end{equation} 
which quantifies its contribution to solving the problem while ensuring the integrity of the information.
Importantly, a lower reliability score for an information pair indicates a higher likelihood of missing critical information. 
The low-scored pairs are prioritized for early inclusion in the iterative summarization process, allowing for the gradual supplementation of missing details through successive iterations.
By evaluating the potential and completeness of each information pair, the method refines and prioritizes the information used in the summarization process, thereby enhancing the overall effectiveness and accuracy of the solution.
The prompts \( T \) for evaluating information pairs are divided into two types:
\begin{itemize}
    \item \textbf{Scalar Value Prompt}: Directly prompts the LLM to output a scalar value \( v \) (ranging from 0 to 1).
    \item \textbf{Opinion-Based Judgment Prompt}:  Prompts the LLM to generate opinion-based judgments (e.g., absolutely reliable, moderately reliable, weakly reliable, unreliable), which can be converted into numerical values \( v \) (1, 0.67, 0.33, 0).
\end{itemize}
To construct the prompt \( T \), we provide stepwise evaluation examples (similar to question answering with rationales) for each instance. 
The reliability rating prompt takes different forms depending on the specific problem, enabling the LLM to assess the information pair and assign an appropriate value based on its judgment.



\subsection{Iterative Summarization for Knowledge Understanding}


\begin{figure}
\centering
\includegraphics[width=0.7\textwidth]{example.drawio.pdf}
\caption{Example inputs of CoT prompting with $\text{ISP}^{2}$.}
\label{fig2}
\end{figure}

At this step, the goal is to summarize the information pairs from adaptive extraction, refining the final knowledge that can aid in solving the problem. 
Among the given $ n $ information pairs, we select the two with the lowest reliability scores for merging, rather than those with higher scores. 
The transition from low to high reliability scores reflects the evolution of knowledge from incomplete to complete information, representing a process of deepening understanding. 
Adaptive extraction focuses on identifying explicit information, while summarization goes beyond this by inferring implicit knowledge from the previously explicit information, thereby providing deeper insights.
In this way, the selection of lower-scoring information pairs allows for the continuous refinement and expansion of knowledge. 
The iterative process facilitates a more comprehensive understanding of the problem, ensuring that both explicit and implicit knowledge are effectively utilized.
Specifically, as illustrated in Figure~\ref{fig1}, we provide the input context \( x \) and the two low-scored information pairs \( IP_a \) and \( IP_b \), and prompt the LLM to generate a new information pair \( IP_{\text{new}} \), represented as:
\begin{equation}
IP_{\text{new}} = \arg\max_{\substack{IP_{\text{new}} \subseteq IP_a \cup IP_b }} \prod_{i=1}^k p_i(IP_{\text{new}} \mid x),
\end{equation}
where \(\prod_{i=1}^k p_i\) represents the product of the probabilities \( p_i \) corresponding to all \( k \) elements in \( IP_{\text{new}} \).
The new information pair \( IP_{\text{new}} \) replaces the original two and is further evaluated for reliability.
The process of summarization and scoring continues through multiple iterations until the LLM generates the final information pair, and then the iterative process ends.
For example, as shown in Figure~\ref{fig2}, four information pairs related to the problem have been obtained, and through reliability scoring, ``Bolts'' and ``Robe'' are found to have the lowest scores.
Then, by using designed prompts to guide the LLM, a new information pair ``Fiber requirements for robe'' is generated, with knowledge descriptions: ``1. A robe requires 2 bolts of blue fiber and half that amount of white fiber.'' and ``2. The total bolts needed for the robe can be calculated by adding the bolts of blue and white fiber.''
This new information will be added to the list of information pairs, and ``Bolts'' and ``Robe'' will be removed.
The information pairs in the list are then re-evaluated to facilitate finding new low-scored information pairs in the next iteration. 
The new prompt is then passed to the LLM, and iterations continue until the model returns a unique information pair.




\subsection{Question Answering}
We treat the reasoning process of LLMs as an autoregressive generation task. 
Typically, the input context $x$ consists of two parts: the question $Q$ and the prompt $T$. Given $x$, the model needs to generate the final result $y$. 
To generate a reasonable $y$, the LLM needs to leverage CoT methods and reason correctly through $z$ as an intermediate step.
We define the predictive probability formula as follows:
\begin{equation}
p(y \mid x = (T, Q)) = p(y \mid x, z) \cdot p(z \mid x),
\end{equation}
where the LLM can be used to compute each conditional probability (e.g., \( p(y \mid x) \)) by incorporating the conditioning variables (\( Q \) and \( T \)) as part of its input.

Building on the foundation, $\text{ISP}^{2}$ enhances reasoning capabilities by integrating information extraction with cognitive summarization. 
Compared to standard prompting methods, $\text{ISP}^{2}$ is a pre-prompting method that focuses on understanding the problem. 
It employs an iterative summarization process before inference, progressively refining information pairs based on the given question. 
$\text{ISP}^{2}$ not only extracts relevant information but also deeply comprehends the context and nuances of the problem at hand.
The method allows the LLM to generate more accurate and contextually relevant information pairs for the final reasoning step.
Ultimately, the unique information pair $ IP $ is combined with the question for the final reasoning process.
Formally, it can be represented as:
\begin{equation}
p(y \mid x = (T, Q, IP)) = p(y \mid x, z) \cdot p(z \mid x),
\end{equation}
where $IP$ represents the unique information pair generated through iterative refinement.
We present the workflow and example inputs in Figures~\ref{fig1} and~\ref{fig2}, respectively, and provide the complete generation results in the appendix.
To further improve the quality and accuracy of results in complex problem scenarios, $\text{ISP}^{2}$ uses specialized prompts such as $\text{ISP}^{2}$-CoT and $\text{ISP}^{2}$-Complex CoT prompts for generating final answers. 




\section{Experiments}

In this section, we present a comprehensive overview of the experiments conducted to evaluate the performance and effectiveness of our proposed method. 
The experimental setup includes detailed descriptions of the datasets used, evaluation metrics, and baseline models. 
The main results are presented in subsequent subsections. Additionally, we analyze the performance across different steps and highlight key findings.


\subsection{Experimental Setup}

\paragraph{Tasks and Datasets}
We evaluate $\text{ISP}^{2}$ on six datasets with diverse input formats. 
Extensive experiments are conducted across these datasets to demonstrate the universality of $\text{ISP}^{2}$ prompts.
Table~\ref{tab:dataset-statistic} provides relevant information about the datasets used in our experiments, detailing the data source, task type, answer type, number of prompt samples, and total test samples for each dataset.
\begin{itemize}
\item \textbf{Arithmetic Reasoning:} The following four arithmetic reasoning benchmarks are widely recognized and considered in the field: AddSub \cite{hosseini-etal-2014-learning}, which includes math word problems on addition and subtraction tailored for third to fifth graders; SVAMP \cite{patel-etal-2021-nlp}, known for its math word problems with diverse structures; AQuA \cite{ling-etal-2017-program}, which focuses on algebraic word problems; GSM8K \cite{Cobbe2021TrainingVT}, a published benchmark that features grade-school math problems.
\item \textbf{Commonsense Reasoning:} StrategyQA (SQA; \citealp{10.1162/tacl_a_00370}) and CommonsenseQA (CSQA; \citealp{talmor-etal-2019-commonsenseqa}) are utilized for commonsense tasks. CSQA comprises questions that require a variety of commonsense knowledge, while StrategyQA includes questions that necessitate multi-step reasoning.
\end{itemize}


\begin{table}[ht]
    \centering
    \renewcommand{\arraystretch}{1.5} % Increase line spacing
    \small % Adjusting the font size
    \resizebox{\textwidth}{!}{
    \begin{tabular}{|l|c|c|c|c|}
    \hline
    Dataset & Reasoning Task & Answer Type &  Example & Number  \\
    \hline
    \href{https://github.com/openai/grade-school-math}{GSM8K} \cite{Cobbe2021TrainingVT} & Arithmetic & Number & 4 & 1,319  \\
    \href{https://github.com/wangxr14/Algebraic-Word-Problem-Solver}{AddSub} \cite{hosseini-etal-2014-learning} & Arithmetic & Number & 4 & 395  \\
    \href{https://github.com/arkilpatel/SVAMP}{SVAMP} \cite{patel-etal-2021-nlp}& Arithmetic & Number & 4 & 1,000  \\
    \href{https://github.com/deepmind/AQuA}{AQuA} \cite{ling-etal-2017-program}& Arithmetic & Multi-choice & 4 & 254  \\
    
    \href{https://github.com/eladsegal/strategyqa}{StrategyQA} \cite{10.1162/tacl_a_00370}& Commonsense & True/False & 4 & 2,290 \\
    \href{https://www.tau-nlp.sites.tau.ac.il/commonsenseqa}{CommonsenseQA} \cite{talmor-etal-2019-commonsenseqa} & Commonsense & Multi-choice & 4 & 1,221  \\
    \hline
    \end{tabular}
    }
    \caption{Overview of datasets utilized in experiments, where ``Number'' represents the number of test samples in each dataset, and ``Example'' is the number of prompt examples used for that dataset.}
    \label{tab:dataset-statistic}
    \vspace{-1em}
\end{table}




\paragraph{Base Prompting}
To effectively evaluate our method, we assess $\text{ISP}^{2}$ performance on three baseline prompting methods: CoT \cite{wei2022chain}, Complex CoT \cite{fu2023complexitybased}, and Self Consistency \cite{wang2023selfconsistency}. 
CoT predicts answers by generating explanations and steps, allowing the model to solve problems through explicit reasoning processes, which makes the decision-making more transparent. 
Complex CoT utilizes a complexity-based strategy, which breaks down complex problems into smaller parts, thereby improving reliability in complex scenarios. 
Self Consistency generates multiple chains of thought and selects the highest-voted result as the final outcome through voting.
Additionally, all experiments are conducted in a few-shot setting without training or fine-tuning the LLMs.





\paragraph{LLMs and Implementations}
We primarily use GPT-3.5\footnote{The GPT-3.5 API model \texttt{gpt-3.5-turbo-0125} is uniformly used in this paper, and it is available at: \url{https://platform.openai.com/docs/models\#gpt-3-5-turbo}}, LLaMA2-7B, and LLaMA2-13B\footnote{LLaMA2-7B and LLaMA2-13B, developed by Meta, have 7 billion and 13 billion parameters, respectively. The hf versions are available at: \url{https://huggingface.co/meta-llama/Llama-2-7b-hf} and \url{https://huggingface.co/meta-llama/Llama-2-13b-hf}.} for testing, and our decoding strategy employs greedy decoding with the temperature set to 0, thus producing deterministic outputs.
For the few-shot setting, we commonly use four samples as exemplars, depending on the difficulty of the dataset. 
We integrate $\text{ISP}^{2}$ into these baseline methods to evaluate its impact, denoted as $\text{ISP}^{2}$-CoT, $\text{ISP}^{2}$-CoT@5, $\text{ISP}^{2}$-ComCoT, and $\text{ISP}^{2}$-ComCoT@5.
Here, $\text{ISP}^{2}$-CoT and $\text{ISP}^{2}$-ComCoT represent $\text{ISP}^{2}$ combined with CoT and Complex CoT, respectively. 
The ``@5'' notation indicates the use of Self Consistency by retrieving five reasoning chains to perform majority voting, enhancing the robustness of the final output.
Additionally, for different tasks, we design answer format instructions in the prompts to regulate the structure of the final answer, facilitating precise answer extraction.



\subsection{Main Results}

The main results are presented in Table \ref{experiment_1}, Table \ref{experiment_2}, and Table \ref{tab:performance_scores}. 
Compared with the SOTA method that also functions as a plug-in, $\text{ISP}^{2}$ demonstrates considerable advantages. 
Significant improvements are achieved across benchmarks for GPT-3.5, LLaMA2-13B, and LLaMA2-7B. The average improvement rates are 7.1\% for GPT-3.5, 8.1\% for LLaMA2-13B, and 12.4\% for LLaMA2-7B.
These results indicate that, by applying $\text{ISP}^{2}$, LLMs can better understand the essence of the problems and enhance their performance.




\begin{table}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{|c|c|cccccc|c|}
\hline
\multirow{2}{*}{Model} & \multirow{2}{*}{Method} & \multicolumn{6}{c|}{Dataset} & \multirow{2}{*}{Average}\\
\cline{3-8}
& & AddSub & SVAMP & GSM8K &AQuA &CSQA &SQA &\\
\hline
\multirow{5}{*}{GPT-3.5 Turbo}    & CoT & 81.2 & 81.2 & 76.2 & 58.7 & 75.5 & 69.7 & 73.75\\
        &\textbf{$\text{ISP}^{2}$-CoT} & 88.6 & 88.7 & 83.4 & 64.4 & 78.7 & 71.4 & 79.43\\
        & & \textcolor{mygreen}{+7.4} & \textcolor{mygreen}{+7.5} & \textcolor{mygreen}{+7.2} & \textcolor{mygreen}{+5.7} & \textcolor{mygreen}{+3.2} & \textcolor{mygreen}{+1.7} & \textcolor{mygreen}{+5.68}\\
        \cline{2-9}
        &CoT@5 & 82.3 & 85.2 & 80.8 & 66.5 & 76.7 &71.2 & 77.12\\
        &\textbf{$\text{ISP}^{2}$-CoT@5} & 93.9 & 90.1 & 84.8 & 72.7 & 81.0 & 73.2 & 82.62\\
        & & \textcolor{mygreen}{+11.6} & \textcolor{mygreen}{+4.9} & \textcolor{mygreen}{+4.0} & \textcolor{mygreen}{+6.2} & \textcolor{mygreen}{+4.3} & \textcolor{mygreen}{+2.0} & \textcolor{mygreen}{+5.50}\\
\hline
\multirow{4}{*}{LLaMA2-13B}    & CoT & 38.1 & 36.3 & 16.7 & 16.6 & 46.9 & 61.3 & 35.98\\
        &\textbf{$\text{ISP}^{2}$-CoT} & 60.0 & 39.2 & 19.3 & 20.4 & 49.2 & 61.4 & 41.58\\
        & & \textcolor{mygreen}{+21.9} & \textcolor{mygreen}{+2.9} & \textcolor{mygreen}{+2.6} & \textcolor{mygreen}{+3.8} & \textcolor{mygreen}{+2.3} & \textcolor{mygreen}{+0.1} & \textcolor{mygreen}{+5.60}\\
        \cline{2-9}
        &CoT@5 & 47.8 & 47.8 & 17.8 & 24.4 & 53.9 & 61.4 &42.18\\
        &\textbf{$\text{ISP}^{2}$-CoT@5} & 64.6 & 49.7 & 21.9 & 25.4 & 53.7 & 64.5 & 46.63 \\
        & & \textcolor{mygreen}{+16.8} & \textcolor{mygreen}{+1.9} & \textcolor{mygreen}{+4.1} & \textcolor{mygreen}{+1.0} & \textcolor{red}{-0.2} & \textcolor{mygreen}{+3.1} & \textcolor{mygreen}{+4.45}\\
\hline
\multirow{6}{*}{LLaMA2-7B}    & CoT & 29.4 & 30.7 & 7.4 & 19.7 & 27.2 &54.7 & 28.18\\
        &\textbf{$\text{ISP}^{2}$-CoT} & 35.2 & 39.1 & 8.3 & 21.6 & 28.4 &56.8 & 31.57\\
        & & \textcolor{mygreen}{+5.8} & \textcolor{mygreen}{+8.4} & \textcolor{mygreen}{+0.9} & \textcolor{mygreen}{+1.9} & \textcolor{mygreen}{+1.2} & \textcolor{mygreen}{+2.1} & \textcolor{mygreen}{+3.39}\\
        \cline{2-9}
        &CoT@5 & 41.8 & 33.4 & 7.8 & 21.6 & 29.1 & 57.4 & 31.85\\
        &\textbf{$\text{ISP}^{2}$-CoT@5} & 43.9 & 42.5 & 11.4 & 29.9 & 30.1 & 58.5 & 36.05\\
        & & \textcolor{mygreen}{+2.1} & \textcolor{mygreen}{+9.1} & \textcolor{mygreen}{+3.6} & \textcolor{mygreen}{+8.3} & \textcolor{mygreen}{+1.0} & \textcolor{mygreen}{+0.9} & \textcolor{mygreen}{+4.20}\\
\hline
\end{tabular}
}
\caption{$\text{ISP}^{2}$ can help to improve the performance when applied to different LLMs and prompting methods. @5 utilizes Self Consistency by retrieving five CoT reasoning chains to make majority votes.}
\label{experiment_1}
\end{table}



\begin{table}
\centering
\small
\begin{tabular}{|c|c|cccc|c|}
\hline
\multirow{2}{*}{Model} & \multirow{2}{*}{Method} & \multicolumn{4}{c|}{Dataset} & \multirow{2}{*}{Average}\\
\cline{3-6}
& & AddSub & SVAMP & GSM8K &AQuA &\\
\hline
\multirow{6}{*}{GPT-3.5 Turbo}    
        &ComCoT & 82.7 & 80.1 & 79.3 & 57.8 & 74.98 \\
        &\textbf{$\text{ISP}^{2}$-ComCoT} & 89.1 & 87.2 & 84.6 & 63.7 & 81.15 \\
        & & \textcolor{mygreen}{+6.4} & \textcolor{mygreen}{+6.4} & \textcolor{mygreen}{+5.3} & \textcolor{mygreen}{+5.9} & \textcolor{mygreen}{+6.17} \\
        \cline{2-7}
        &ComCoT@5 & 83.9 & 84.4 & 83.9 & 64.6 & 78.98 \\
        &\textbf{$\text{ISP}^{2}$-ComCoT@5} & 92.8 & 90.8 & 87.0 & 70.5 & 85.28 \\
        & & \textcolor{mygreen}{+8.9} & \textcolor{mygreen}{+6.4} & \textcolor{mygreen}{+3.1} & \textcolor{mygreen}{+5.9} & \textcolor{mygreen}{+6.30} \\
\hline
\multirow{6}{*}{LLaMA2-13B}    
        &ComCoT &32.2 & 33.9 & 15.5 & 19.7 & 25.33 \\
        &\textbf{$\text{ISP}^{2}$-ComCoT} & 70.9 & 47.8 & 18.8 & 20.4 & 39.48 \\
        & & \textcolor{mygreen}{+38.7} & \textcolor{mygreen}{+13.9} & \textcolor{mygreen}{+3.3} & \textcolor{mygreen}{+0.7} & \textcolor{mygreen}{+14.15} \\
        \cline{2-7}
        &ComCoT@5 & 52.1 & 43.6 & 16.9 & 23.6 & 34.05 \\
        &\textbf{$\text{ISP}^{2}$-ComCoT@5} & 74.9 & 57.1 & 20.4 & 24.4 & 44.20 \\
        & & \textcolor{mygreen}{+22.8} & \textcolor{mygreen}{+13.5} & \textcolor{mygreen}{+3.5} & \textcolor{mygreen}{+0.8} & \textcolor{mygreen}{+10.15} \\
\hline
\multirow{6}{*}{LLaMA2-7B}    
        &ComCoT & 27.8 & 30.7 & 8.4 & 22.1 & 22.25 \\
        &\textbf{$\text{ISP}^{2}$-ComCoT} & 36.8 & 38.3 & 8.6 & 25.9 & 27.40 \\
        & & \textcolor{mygreen}{+9.0} & \textcolor{mygreen}{+7.6} & \textcolor{mygreen}{+0.2} & \textcolor{mygreen}{+3.8} & \textcolor{mygreen}{+5.15} \\
        \cline{2-7}
        &ComCoT@5 & 39.9 & 34.2 & 10.5 & 24.5 & 27.28 \\
        &\textbf{$\text{ISP}^{2}$-ComCoT@5} & 43.5 & 43.6 & 12.8 & 27.2 & 31.78 \\
        & & \textcolor{mygreen}{+3.6} & \textcolor{mygreen}{+9.4} & \textcolor{mygreen}{+2.3} & \textcolor{mygreen}{+2.7} & \textcolor{mygreen}{+4.50} \\
\hline
\end{tabular}
\caption{$\text{ISP}^{2}$ can help to improve the performance when applied to different LLMs and prompting methods. @5 utilizes Self Consistency by retrieving five Complex CoT reasoning chains to make majority votes. ``ComCoT'' stands for Complex CoT.}
\label{experiment_2}
\end{table}











\paragraph{Mathematical Reasoning} 

Table \ref{experiment_1} and Table \ref{experiment_2} report performance on the Math Word Problem (MWP) task, where our method achieves significant performance improvements across various mathematical subdomains, surpassing ComplexCoT by $5.4\%$ on GPT-3.5. 
Notably, the enhancement is particularly significant when combined with Self Consistency, and the robustness and accuracy of $\text{ISP}^{2}$ on different models are evident. 
The AddSub dataset focuses on basic mathematical operations, and for models like GPT and Llama, they already possess sufficient capabilities to solve most problems. 
However, problems that are not successfully solved usually contain misleading information, and $\text{ISP}^{2}$ can avoid the influence of misleading information by filtering out unnecessary data in adaptive extraction, thus enabling more successful problem solving on LLMs. 
SVAMP and GSM8K are more advanced MWP datasets that focus on generalization capabilities in arithmetic and more complex algebra and geometry problems. 
The Base Prompt method shows a notable decrease on these two datasets compared to AddSub. 
Interestingly, we observe that the improvements of $\text{ISP}^{2}$ on SVAMP and GSM8K are not less than those on AddSub. 
$\text{ISP}^{2}$ can correct wrong directions through adaptive extraction and prevent misleading information from contaminating the summary process. 
Unlike the first three datasets, AQuA is an algebraic multiple-choice dataset, which not only requires the model to solve the problem but also to select the correct answer from multiple options. 
The format demands higher judgment capabilities from the model. 
$\text{ISP}^{2}$ facilitates further computations by allowing algebraic information pairs to be stored in formulaic forms. 
$\text{ISP}^{2}$ demonstrates good performance on mathematical datasets, enhancing the effectiveness of solutions by revealing key data essential for problem resolution, thus improving the performance of both ComplexCoT and Self Consistency.


Notably, as detailed in Table \ref{tab:performance_scores}, compared to the current SOTA plug-and-play methods in mathematics, our approach performs well, reaching new best levels in AddSub and SVAMP, and second-best levels in GSM8K and AQuA. 
The main reason is that $\text{ISP}^{2}$ continuously improves solutions by selectively gathering previous descriptions of problem space, allowing it to accurately resolve issues.
Instead of relying on multiple rounds of interactive reasoning, repetitive reading, or relationship extraction to obtain crucial information, solving many complex mathematical problems emphasizes the thought process and information filtering. 
By focusing on these aspects, misleading information is excluded, and effective problem-solving strategies are developed. 
It enables $\text{ISP}^{2}$ to achieve significant improvements in mathematical reasoning.


\begin{table}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{|c|c|cccccc|c|}
\hline
\multirow{7}{*}{\shortstack{GPT-3.5 \\ turbo}} &Prompt & AddSub & SVAMP & GSM8K & AQuA & CSQA & SQA & Average \\ 
\hline
&CoT~\cite{wei2022chain}    & 81.2 & 81.2 & 76.2 & 58.7 & 75.5 & 69.7 & 73.75 \\ 
&PHP-CoT~\cite{zheng2024progressivehint} & \underline{86.1} & 83.1 &\textbf{84.6} &\textbf{65.4} & 76.2 & 69.2 & \underline{77.43} \\ 
&RE2-CoT~\cite{xu2024rereading}  & 82.7 & \underline{84.9} & 81.2 & 63.3 & 77.9 & 67.1 & 76.60 \\ 
&ERA-CoT~\cite{liu-etal-2024-era}  & 83.8 &82.2  & 80.2 & 56.9 & \textbf{83.2} & \textbf{71.4} & 76.28 \\ 
&$\text{ISP}^{2}$-CoT   &\textbf{88.6}  &\textbf{88.7}   &\underline{83.4}  &\underline{64.4} & \underline{78.7} & \textbf{71.4} & \textbf{79.20} \\ 
\hline
\end{tabular}
}
\caption{The comparison results of existing SOTA plug-and-play methods on GPT-3.5. We use accuracy as the evaluation metric. The best result is highlighted in \textbf{bold}, and the second best is \underline{underlined}.}
\label{tab:performance_scores}
\end{table}


\paragraph{Commonsense reasoning} 
As shown in Table \ref{experiment_1}, for the StrategyQA and CommonsenseQA datasets, $\text{ISP}^{2}$ has increased performance by $4.9\%$ and $2.8\%$, respectively. 
Additionally, Self Consistency maintains an orthogonal relationship with $\text{ISP}^{2}$, both enhancing the performance of CoT, enabling it to better understand common sense and make more accurate choices in commonsense reasoning. 
There is a significant performance improvement in the CSQA dataset, where many hidden pieces of information exist within the problems, and relying solely on explicit knowledge is insufficient for accurate responses. 
Notably, in StrategyQA with the LLaMA2 model, $\text{ISP}^{2}$ did not demonstrate significant improvement. 
Smaller parameter LLMs have a tendency to include irrelevant text when forming information pairs. 
Consequently, this inclusion leads to the formation of continuous erroneous reasoning chains. 
Such errors can impede the ability of $\text{ISP}^{2}$ to effectively enhance performance.
However, most experiments still show that $\text{ISP}^{2}$ outperforms most base prompts in commonsense datasets, demonstrating strong capability of $\text{ISP}^{2}$ to guide CoT. 
When compared with existing SOTA methods, it performs comparably to ERA in SQA, which excels at controlling the reasoning process using entity relations, and also maintains second-best performance in CSQA.


\subsection{Ablation Studies}





\begin{table}[!b]
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{|c|c|cccccc|c|}
\hline
\multirow{2}{*}{Model} & \multirow{2}{*}{Method} & \multicolumn{6}{c|}{Dataset} & \multirow{2}{*}{Average}\\
\cline{3-8}
& & AddSub & SVAMP & GSM8K & AQuA & CSQA & SQA &\\
\hline
\multirow{4}{*}{GPT-3.5 Turbo}    & OE & 84.3 & 82.4 & 79.8 & \underline{60.2} & 75.1 & \underline{71.3} & 75.52\\
        &OIP & \underline{85.7} & \underline{86.3} & \underline{80.4} & 60.1 & \underline{76.7} & 70.1 & \underline{76.55}  \\
        &$\text{SAISP}^{2}$ & 83.8 & 85.1 & 76.9 & 59.9 & 75.9 & 69.1 & 75.12\\
        &$\text{ISP}^{2}$ & \textbf{88.6} & \textbf{88.7} & \textbf{83.4} & \textbf{64.4} & \textbf{78.7} & \textbf{71.4} & \textbf{79.20}  \\
\hline
\multirow{4}{*}{LLaMA2-13B}    & OE & 58.5 & 36.7 & 14.1 & \underline{18.9} & 45.2 & 60.3 & 38.95 \\
        &OIP & \textbf{61.2}  & \textbf{40.2} & \underline{17.2} & \underline{18.9} & \underline{46.6} & \textbf{61.7} & \underline{40.97} \\
        &$\text{SAISP}^{2}$ & 59.2 & 37.2 & 15.9 & \underline{18.9} & 43.9 & 60.3 & 39.23 \\
        &$\text{ISP}^{2}$ & \underline{60.0} & \underline{39.2} & \textbf{19.3} & \textbf{20.4} & \textbf{49.2} & \underline{61.4} & \textbf{41.58} \\
\hline
\multirow{4}{*}{LLaMA2-7B}    & OE & \underline{34.9} & 37.5 & \textbf{11.1} & 19.8 & \underline{27.7} & \textbf{57.2} & \underline{31.37}\\
        &OIP & 34.6 & \underline{38.5} & \underline{10.6} & \underline{20.1} & 27.5 & 56.5 & 31.30\\
        &$\text{SAISP}^{2}$ & 34.6 & 37.8 & 9.4 & 19.8 & \underline{27.7} & 56.1 & 30.91\\
        &$\text{ISP}^{2}$ & \textbf{35.2} & \textbf{39.1} & 8.3 & \textbf{21.6} & \textbf{28.4} & \underline{56.8} & \textbf{31.57}\\
\hline
\end{tabular}
}
\caption{Performance comparison of $\text{ISP}^{2}$ aiding CoT reasoning under different combinations and summarization order settings.}
\label{experiment_3}
\end{table}




$\text{ISP}^{2}$ involves multiple processes for handling information pairs and summarizing knowledge when assisting CoT predictions. 
We break down various steps to assess the impact of each component in $\text{ISP}^{2}$ on model performance, which helps us understand the importance of different factors within $\text{ISP}^{2}$. 
The various variants are listed below:
\begin{itemize}
\item \textbf{Only Entity Extraction (OE):} This variant represents that before reasoning to answer questions, we only involve using LLMs for named entity recognition to extract entities as useful information. It omits the step of iterative summarization and directly provides the entities to LLMs for reasoning.
\item \textbf{Only Information Pair (OIP):} This variant differs from OE in that we perform a complete extraction of information pairs during information retrieval, yet still omit the step of iterative summarization, directly providing the information pairs to LLMs for reasoning.
\item \textbf{Score Alteration-$\text{ISP}^{2}$ ($\text{SAISP}^{2}$):} In this variant, we retain the $\text{ISP}^{2}$ steps, but in the iterative summarization, we change from summarizing the two information pairs with the lowest scores to those with the highest scores.
\end{itemize}

\begin{table}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{|c|c|cccccc|c|}
\hline
\multirow{2}{*}{Model} & \multirow{2}{*}{Method} & \multicolumn{6}{c|}{Dataset} & \multirow{2}{*}{Average}\\
\cline{3-8}
& & AddSub & SVAMP & GSM8K & AQuA & CSQA & SQA &\\
\hline
\multirow{4}{*}{GPT-3.5 Turbo}    & OE@5 & 90.1 & 85.8 & 83.2 & 66.8 & 77.4 & \underline{72.4} & 79.28\\
        &OIP@5 & 92.5  & 85.7 & 83.2 & \underline{69.9} & 78.2 & 72.3 &\underline{80.30}  \\
        &$\text{SAISP}^{2}$@5 & \underline{93.4} & \underline{87.1} & \underline{84.2} & 67.2 & \underline{78.6} & 71.1 & 80.27\\
        &$\text{ISP}^{2}$@5 & \textbf{93.9} & \textbf{90.1} & \textbf{84.8} & \textbf{72.7} & \textbf{81.0} & \textbf{73.2} &\textbf{82.62}  \\
\hline
\multirow{4}{*}{LLaMA2-13B}    & OE@5 & 62.0 & 48.1 & 18.7 & 22.3 & 52.3 & 63.4 & 44.47 \\
        &OIP@5 & \textbf{64.6} & \underline{48.2} & \textbf{22.1} & 22.9 & \underline{53.1} & \underline{63.7} & \underline{45.77} \\
        &$\text{SAISP}^{2}$@5 & 61.0 & 48.0 & 20.9 & \underline{24.5} & 52.6 & 61.8 & 44.80 \\
        &$\text{ISP}^{2}$@5 & \textbf{64.6} & \textbf{49.7} & \underline{21.9} & \textbf{25.4} & \textbf{53.7} & \textbf{64.5} & \textbf{46.63} \\
\hline
\multirow{4}{*}{LLaMA2-7B}    & OE@5 &\underline{42.0} & 29.3 & \underline{11.3} & \underline{26.3} & \underline{29.2} & 55.6 & 32.28\\
        &OIP@5 & 40.1 & \underline{37.8} & 8.9 & 24.0 & 28.9 & \underline{55.8} & \underline{32.58}\\
        &$\text{SAISP}^{2}$@5 & 38.5 & 36.1 & 8.1 & 23.2 & 27.4 & 54.6 & 31.32\\
        &$\text{ISP}^{2}$@5 & \textbf{43.9} & \textbf{42.5} & \textbf{11.4} & \textbf{29.9} & \textbf{30.1} & \textbf{58.5} & \textbf{36.05}\\
\hline
\end{tabular}
}
\caption{Performance comparison of $\text{ISP}^{2}$ across different combinations and ordering of summarization steps, where Self Consistency @5 is utilized by retrieving five chains of CoT reasoning to make majority votes.}
\label{experiment_4}
\end{table}



\begin{table}
\centering
\small
\begin{tabular}{|c|c|cccc|c|}
\hline
\multirow{2}{*}{Model} & \multirow{2}{*}{Method} & \multicolumn{4}{c|}{Dataset} & \multirow{2}{*}{Average}\\
\cline{3-6}
& & AddSub & SVAMP & GSM8K &AQuA &\\
\hline
\multirow{4}{*}{GPT-3.5 Turbo}    & OE & \underline{86.6} & 83.9 & \underline{83.9} & 59.4 & 78.45 \\
        &OIP & 86.3 & \underline{85.3} & 82.8 & \underline{60.1} & \underline{78.63} \\
        &$\text{SAISP}^{2}$ & 85.2 & 84.9 & 80.1 & 58.7 & 77.23 \\
        &$\text{ISP}^{2}$ & \textbf{89.1} & \textbf{87.2} & \textbf{84.6} & \textbf{63.7} & \textbf{81.15} \\
\hline
\multirow{4}{*}{LLaMA2-13B}    & OE & 64.8 & \underline{44.8} & 15.7 & \textbf{20.4} & 36.43 \\
        &OIP & 66.8 & \underline{44.8} & \underline{17.7} & 19.2 & \underline{37.13} \\
        &$\text{SAISP}^{2}$ & \underline{69.1} & 43.1 & 16.5 & 17.7 & 36.60 \\
        &$\text{ISP}^{2}$ & \textbf{70.9} & \textbf{47.8} & \textbf{18.8} & \textbf{20.4} & \textbf{39.48} \\
\hline
\multirow{4}{*}{LLaMA2-7B} & OE & \underline{35.2} & 34.6 & 7.8 & 22.7 & 25.01 \\
        &OIP & \underline{35.2} & \underline{37.6} & \underline{8.4} & \underline{25.4} & \underline{26.65} \\
        &$\text{SAISP}^{2}$ & 33.4 & 35.0 & 7.4 & 23.8 & 23.89 \\
        &$\text{ISP}^{2}$ & \textbf{36.8} & \textbf{38.3} & \textbf{8.6} &\textbf{25.9} & \textbf{27.40} \\
\hline
\end{tabular}
\caption{Performance comparison of $\text{ISP}^{2}$ aiding Complex CoT reasoning under different combinations and summarization order settings.}
\label{experiment_5}
\end{table}


\begin{table}
\centering
\small
\begin{tabular}{|c|c|cccc|c|}
\hline
\multirow{2}{*}{Model} & \multirow{2}{*}{Method} & \multicolumn{4}{c|}{Dataset} & \multirow{2}{*}{Average}\\
\cline{3-6}
& & AddSub & SVAMP & GSM8K &AQuA &\\
\hline
\multirow{4}{*}{GPT-3.5 Turbo}    & OE@5 & 91.2 & 84.9 & 84.2 & 68.9 & 82.30 \\
        &OIP@5 & 92.2 & 86.7 & 85.2 & 68.8 & 83.23 \\
        &$\text{SAISP}^{2}$@5 & \underline{92.3} & \underline{88.6} & \underline{86.9} & \underline{69.2} & \underline{84.25} \\
        &$\text{ISP}^{2}$@5 & \textbf{92.8} & \textbf{90.8} & \textbf{87.0} & \textbf{70.5} & \textbf{85.28} \\
\hline
\multirow{4}{*}{LLaMA2-13B}    & OE@5 & 64.8 & 56.4 & 17.9 & 23.7 & 40.70 \\
        &OIP@5 & 66.8 & \textbf{57.1} & 19.8 & \textbf{24.8} & 42.13 \\
        &$\text{SAISP}^{2}$@5 & \underline{72.2} & 55.6 & \underline{20.2} & \underline{24.5} & \underline{43.13} \\
        &$\text{ISP}^{2}$@5 & \textbf{74.9} & \textbf{57.1} & \textbf{20.4} & 24.4 & \textbf{44.20} \\
\hline
\multirow{4}{*}{LLaMA2-7B}    & OE@5 & \textbf{43.8} & \underline{42.1} & \underline{11.4} & 20.1 & \underline{29.35} \\
        &OIP@5 & 40.3 & 34.9 & 9.7 & \underline{24.4} & 27.33 \\
        &$\text{SAISP}^{2}$@5 & 40.6 & 36.1 & 6.3 & \underline{24.4} & 26.85 \\
        &$\text{ISP}^{2}$@5 & \underline{43.5} & \textbf{43.6} & \textbf{12.8} & \textbf{27.2} & \textbf{31.78} \\
\hline
\end{tabular}
\caption{Performance comparison of $\text{ISP}^{2}$ across different combinations and ordering of summarization steps, where Self Consistency @5 is utilized by retrieving five chains of Complex CoT reasoning to make majority votes.}
\label{experiment_6}
\end{table}






\textbf{Information pair extraction and two low-score information pair summarization are effective for answering questions.} 
In tests across six datasets, both CoT and Complex CoT performance improved with the addition of entities and complete information pairs. 
However, the inclusion of complete information pairs was more significantly beneficial for problem reasoning. 
Information pairs offer more detailed explanations compared to entity extraction. 
While entity extraction helps LLMs focus on key terms, OE reasoning requires the model to balance understanding the context and solving the problem simultaneously, which can complicate the reasoning process. 
On the other hand, OIP reasoning provides detailed descriptions that simplify the reasoning process by allowing the LLM to concentrate solely on solving the problem, thereby reducing unnecessary complexity and enhancing effectiveness.
Additionally, when $\text{ISP}^{2}$ switched to extracting the two highest scoring information pairs for iterative summarization, the results were not competitive. 
The highest scores indicate that the information pairs already possess a significant positive boost to answering the question. After summarization, further exploration of in-depth information becomes limited.
In contrast, low-scoring information pairs, having been retained through the filtering process, prove their efficacy yet still have substantial potential to be tapped. 
Therefore, choosing two low-scoring information pairs allows for the accumulation of more implicit knowledge through iterative summarization, enabling the information pairs to continuously deepen their cognitive approach towards providing more reliable knowledge.


\section{Discussion}
\begin{figure}[!b]
\centering
\includegraphics[width=\textwidth]{all_datasets_step_summarization.pdf} 
\caption{Distribution of summarization steps and accuracy across different LLMs on various datasets}
\label{fig4}
\end{figure}

\subsection{Summarization Steps Analysis}
We analyze the distribution of iterative summarization step lengths during inference and their positive guiding effect on reasoning.
In Figure \ref{fig4}, we illustrate the impact of final information pairs generated from different step lengths on the performance of LLMs across six datasets.
A common observation is that shorter steps consistently provide effective information for answering questions. 
We also find that the step length distribution for each task predominantly falls within the category of short steps. 
During adaptive extraction, much of the less helpful information has already been filtered out, resulting in step length compression and simplifying the reasoning process.
In fact, on LLaMA, information generated with longer steps can still effectively assist reasoning. 
However, for GPT-3.5, longer steps may not offer substantial support. 
We believe this is because GPT inherently possesses strong problem-solving capabilities, and excessively long steps might interfere with the coherence of final information integration.
In contrast, LLaMA can leverage more extensive summarization steps to enhance the processing and retention of knowledge within the information. 
These summaries serve as navigation tools within the problem space, continuously reinforcing and accumulating critical details to guide the transition from the initial state to the goal state.

\subsection{Error Source Analysis}


We extracted 100 erroneous samples from each dataset and identified three critical types of errors that arise during the $\text{ISP}^{2}$ process:
\begin{enumerate}
\item \textbf{Information Pair Error (IPE):} Failure to extract all explicit information pairs in the question, or extraction of information pairs with misleading information.
\item \textbf{Summarization Error (SE):} Summaries that contain misleading information.
\item \textbf{Reasoning Error (RE):} Correct summarization but generation of an incorrect answer due to faulty reasoning.
\end{enumerate}
These types of errors represent potential points of failure at each step of the $\text{ISP}^{2}$ process, which can propagate and affect subsequent operations.
\begin{table}[ht]
\centering
\setlength{\tabcolsep}{6pt} % Default value: 6pt
\renewcommand{\arraystretch}{1.2} % Default value: 1
\small % Smaller font size
\begin{tabular}{@{}p{2.5cm}p{2.2cm}ccc@{}}
\toprule
\textbf{Task}                & \textbf{Dataset}    & \textbf{IPE} & \textbf{SE} & \textbf{RE} \\ \midrule
Commonsense                  & StrategyQA               & 24\%         & 20\%         & 15\%         \\
Reasoning                    & CSQA                        & 19\%         & 22\%         & 10\%         \\ \midrule
                             & SVAMP                      & 19\%         & 11\%         & 3\%          \\
Mathematical                 & AQuA                     & 38\%         & 12\%         & 10\%         \\
Reasoning                    & AddSub                   & 10\%         & 17\%         & 13\%         \\
                             & GSM8K                        & 21\%         & 10\%         & 32\%         \\ \bottomrule
\end{tabular}
\caption{Proportion of different error categories across various datasets.}
\label{tab:error_analysis}
\end{table}

Table \ref{tab:error_analysis} displays the distribution of error categories across each dataset. 
Considering the error categories, the probability of information pair extraction errors is the highest, while the error rate for summarization is relatively lower, and the error rate for inference is the lowest.
We observe that in commonsense reasoning datasets, the error rates for information extraction and summarization are close. 
It is attributed to the fact that the model's own knowledge contains some elements related to implicit information, so the defects in information pair extraction are not significant.
However, during the final inference stage, conflicts often arise between the LLM's inherent knowledge and the already summarized information, leading to judgment errors in the final decision-making process.
The rate of information pair extraction errors in mathematical datasets shows a positive correlation with the complexity of the dataset. 
From basic arithmetic operations to complex algebraic calculations, this trend becomes increasingly pronounced. 
It indicates that as the difficulty of problems increases, the LLM's limitations in mathematical understanding become more apparent, leading to a greater impact on its ability to accurately collect and process mathematical information. 
Additionally, summarization helps improve dataset accuracy, but deficiencies in the large model's mathematical computation capabilities during the summarization process still exist.
Indeed, this issue can also arise during the inference process. 
Even if the summarized content is accurate, computational errors can still occur during inference.

\section{Conclusion}

We propose a new method called Iterative Summarization Pre-Prompting ($\text{ISP}^{2}$), which utilizes LLMs for precise information extraction and iterative summarization processes. 
By drawing on human approaches to understanding problems, $\text{ISP}^{2}$ enhances the reasoning capabilities of the CoT method when applied to LLMs.
The process of information understanding can address the weaknesses of these classical methods, providing a way to solve complex problems where the known information is too scattered, not cohesive, and not formalized. 
Additionally, $\text{ISP}^{2}$ can significantly improve the performance of LLMs on several datasets, and it can be easily combined with CoT and Self Consistency to further enhance reasoning effectiveness.
Equally important, the framework in our work only demonstrates the enhanced reasoning capabilities of $\text{ISP}^{2}$. 
From a broader perspective, we consider this an invitation to expand inquiry. 
We hope our method will inspire further research in NLP. 
It revisits the discussion of problem space interpretation, incorporating Simon's linguistic theories. 
$\text{ISP}^{2}$ should provide valuable references for researchers, encouraging them to undertake more in-depth investigations across a variety of languages.





\section{Appendix}

\appendix

\appendixsection{Answer Formation}

Here is the $\text{ISP}^{2}$ process dialogue, with different steps handled through CoT.
Table \ref{tab:math} and Table \ref{tab:com} show the reasoning processes for mathematical and commonsense problems, respectively.
In the final part of our reasoning to generate the answer, we will fix the format of the final answer, as shown in Table \ref{tab:formation}.


\begin{table}[ht]
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{|l|l|}
\hline
\textbf{Tasks}        & \textbf{Answer-format Instructions}                                  \\ \hline
GSM8K                 & Your final answer should be a single numerical number, in the form \texttt{answer}. \\ \hline
SVAMP                 & Your final answer should be a single numerical number, in the form \texttt{answer}. \\ \hline
AQuA                  & Your answer should be a single numerical number in the form \texttt{answer}, \\&and there should be only one correct choice in the form \texttt{choice}.
 \\ \hline
AddSub                & Your final answer should be a single numerical number, in the form \texttt{answer}. \\ \hline
CommonsenseQA         & Your answer should be in the form \texttt{choice}. There is only one correct choice. \\ \hline
StrategyQA            & Your answer should be either \texttt{yes} or \texttt{no}, in the form \texttt{answer}. \\ \hline
\end{tabular}
}
\caption{Answer format instructions for various tasks}
\label{tab:formation}
\end{table}


\begin{table}[!h]
\centering
\begin{tabular}{p{0.95\linewidth}}
\hline
\textbf{GSM8K Problem}\\
\hline
Q: Kylar went to the store to buy glasses for his new apartment. One glass costs \$5, but every second glass costs only 60\% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?\\
\hline
\textbf{Adaptive Extraction of Candidate Information} \\
\hline
\textbf{Information Pairs}\\
\textbf{Glasses}\\
- One glass costs \$5.\\
- Every second glass costs 60\% of the price.\\
- Kylar wants to buy 16 glasses in total.\\
\textbf{Kylar}\\
- Kylar went to the store.\\
- Wants 16 glasses.\\
\textbf{Price}\\
- One glass costs \$5.\\
- Every second glass costs 60\%.\\
- Kylar needs to pay for all glasses.\\
\hline
\textbf{Iterative Summarization for Knowledge Understanding}\\
\hline
\textbf{Step1}\\
$\text{Kylar} + \text{Price} \Rightarrow \text{Glasses Purchase}$\\
- Buying 16 glasses.\\
- One glass \$5, second glass 60\% of that.\\

\textbf{Step2}\\
$\text{Glasses Purchase} + \text{Glasses} \Rightarrow \text{Total Purchase}$\\
- 16 glasses for new apartment.\\
- Alternating prices, \$5 and \$3.\\
- Calculate total based on pricing scheme.\\
\hline
\textbf{Final Input Formation}\\
\hline
Q: Kylar went to the store to buy glasses for his new apartment. One glass costs \$5, but every second glass costs only 60\% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?\\
Glasses Purchase\\
- Kylar is purchasing 16 glasses for his new apartment.\\
- The price of one glass is \$5, with every second glass costing only 60\% of the original price.\\
- Kylar needs to pay for all the glasses he wants to buy.\\
- The total cost of the glasses can be calculated based on the pricing scheme.\\
A: Let's think step by step.\\
\hline
\textbf{Output}\\
\hline
Kylar wants to buy 16 glasses for his new apartment.\\
The price of the first glass is \$5.\\
The price of every second glass is 60\% of the original price, which is $0.6 \times 5 = 3$.\\
This creates an alternating pattern of $5$ and $3$ for each glass.\\
So, for every pair of glasses, Kylar pays \$5 + \$3 = \$8.\\
Since there are 16 glasses in total, there are $\frac{16}{2} = 8$ pairs of glasses.\\
Therefore, the total cost for all 16 glasses would be 8 pairs $\times \$8$ per pair = \$64.\\
\hline
\textbf{Answer}: 64\\
\hline
\end{tabular}
\caption{Inference Process of $\text{ISP}^{2}$ on Mathematical Problem Datasets.}
\label{tab:math}
\end{table}


\begin{table}[!h]
\centering
\begin{tabular}{p{0.95\linewidth}}
\hline
\textbf{SQA Problem}\\
\hline
Q: Do the anchors on Rede Globo speak Chinese?\\
\hline
\textbf{Adaptive Extraction of Candidate Information} \\
\hline
\textbf{Information Pairs}\\
\textbf{Anchors on Rede Globo}\\
- The anchors on Rede Globo refer to the news presenters or hosts on the Brazilian television network Rede Globo, one of the largest TV networks in Brazil.\\
- These anchors are responsible for delivering news, conducting interviews, and hosting various programs on the network.\\
- Rede Globo anchors are typically well-versed in multiple languages to cater to a diverse audience.\\
\textbf{Chinese language}\\
- Chinese is one of the most widely spoken languages globally, with numerous dialects such as Mandarin and Cantonese.\\
- Learning Chinese can be challenging due to its tonal nature and complex writing system.\\
- It is unlikely that anchors on Rede Globo speak Chinese unless they have a specific background or training in the language.\\
\hline
\textbf{ Iterative Summarization for Knowledge Understanding}\\
\hline
\textbf{Step1}\\
$\text{Anchors on Rede Globo} + \text{Chinese language} \Rightarrow \text{Anchors on Rede Globo}$\\
- The anchors on Rede Globo, prominent figures on the Brazilian television network, are adept at presenting news, conducting interviews, and hosting programs for a diverse audience.\\
- Given the global reach of Rede Globo, these anchors are likely proficient in multiple languages to cater to a wide viewership.\\
- While Chinese is a widely spoken language with complexities in tonality and writing, it is improbable that anchors on Rede Globo speak Chinese unless they possess specialized training or background in the language.\\
\hline
\textbf{Final Input Formation}\\
\hline
Q: Do the anchors on Rede Globo speak Chinese?\\
Anchors on Rede Globo\\
- The anchors on Rede Globo, prominent figures on the Brazilian television network, are adept at presenting news, conducting interviews, and hosting programs for a diverse audience.\\
- Given the global reach of Rede Globo, these anchors are likely proficient in multiple languages to cater to a wide viewership.\\
- While Chinese is a widely spoken language with complexities in tonality and writing, it is improbable that anchors on Rede Globo speak Chinese unless they possess specialized training or background in the language.\\
A: Let's think step by step.\\
\hline
\textbf{Output}\\
\hline
Anchors on Rede Globo are skilled in presenting news for a diverse audience.\\
They likely know multiple languages to cater to a wide viewership.\\
Chinese is a complex language with tonality and writing systems.\\
It is improbable that anchors on Rede Globo speak Chinese unless they have specialized training in the language.\\
So the answer is no.\\
\hline
\textbf{Answer}: No\\
\hline
\end{tabular}
\caption{Inference Process of $\text{ISP}^{2}$ on Commonsense Problem Datasets}
\label{tab:com}
\end{table}

\appendixsection{Evaluation Metrics} 

Accuracy is used as an evaluation metric to measure performance in tasks involving mathematical reasoning and commonsense reasoning. 
For problems whose output is a number, we use regular expressions for exact-match evaluation on datasets such as GSM8K, SVAMP, and AddSub. 
However, on AddSub, we find that LLMs cannot perform computations precisely (especially when dealing with high-precision floating-point values and large numbers), so we instead compare answers using a relative tolerance of 0.001. 
For AQuA, which consists of multiple-choice mathematical questions, we check that the extracted option matches the correct option, so that incorrect solutions are not counted as correct. 
For commonsense datasets whose answers are binary (yes/no) or multiple-choice, such as StrategyQA and CommonsenseQA, respectively, we check whether the extracted result matches the provided label.



\begin{acknowledgments}
This work was supported in part by the Science and Technology Commission of Shanghai Municipality under Grant 21DZ2203100, in part by the National Natural Science Foundation of China under Grant 62006150, in part by the Shanghai Local Capacity Enhancement Project under Grant 21010501500, and in part by the Science and Technology Innovation Action Plan of the Shanghai Science and Technology Commission for Social Development Projects under Grant 21DZ1204900.
\end{acknowledgments}

\starttwocolumn
\bibliography{compling_style}

\end{document}