% This must be in the first 5 lines to tell arXiv to use pdfLaTeX, which is strongly recommended.
\pdfoutput=1
% In particular, the hyperref package requires pdfLaTeX in order to break URLs across lines.

\documentclass[11pt]{article}

% Change "review" to "final" to generate the final (sometimes called camera-ready) version.
% Change to "preprint" to generate a non-anonymous version with page numbers.
\usepackage[final]{acl}

% Standard package includes
\usepackage{times}
\usepackage{latexsym}

% For proper rendering and hyphenation of words containing Latin characters (including in bib files)
\usepackage[T1]{fontenc}
% For Vietnamese characters
% \usepackage[T5]{fontenc}
% See https://www.latex-project.org/help/documentation/encguide.pdf for other character sets

% This assumes your files are encoded as UTF8
\usepackage[utf8]{inputenc}

% This is not strictly necessary, and may be commented out,
% but it will improve the layout of the manuscript,
% and will typically save some space.
\usepackage{microtype}

% This is also not strictly necessary, and may be commented out.
% However, it will improve the aesthetics of text in
% the typewriter font.
\usepackage{inconsolata}

%------- Non template additions
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage{tabularx}
\usepackage{cleveref}
\usepackage{tablefootnote}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{mdframed}
\usepackage{xcolor}
\def\statements/{\texttt{statements}}
\def\Statements/{\texttt{Statements}}
\def\statement/{\texttt{statement}}
\def\Statement/{\texttt{Statement}}

%==--------------------------------

% If the title and author information does not fit in the area allocated, uncomment the following
%
%\setlength\titlebox{<dim>}
%
% and set <dim> to something 5cm or larger.


\title{Statements: Universal Information Extraction from Tables
with Large Language Models for ESG KPIs}

\author{
 \textbf{Lokesh Mishra\textsuperscript{1}},
 \textbf{Sohayl Dhibi\textsuperscript{1}},
 \textbf{Yusik Kim\textsuperscript{2}},
 \\
 \textbf{Cesar Berrospi Ramis\textsuperscript{1}},
\textbf{Shubham Gupta\textsuperscript{2}},
 \textbf{Michele Dolfi\textsuperscript{1}},
 \textbf{Peter Staar\textsuperscript{1}},
\\
\\
 \textsuperscript{1}IBM Research Zurich, Säumerstrasse 4, Rüschlikon, Switzerland,
 \\
 \textsuperscript{2}IBM Research Paris-Saclay,  2 Rue d'Arsonval, Orsay, France
\\
\small \texttt{[mis, ceb, dol, taa]@zurich.ibm.com} 
\\
\small \texttt{[sohayl.dhibi, yusik.kim, shubham.gupta1]@ibm.com}
}

\begin{document}
\maketitle
\begin{abstract}
Environment, Social, and Governance (ESG) KPIs assess an organization's performance on issues such as climate change, greenhouse gas emissions, water consumption, waste management, human rights, diversity, and policies. ESG reports convey this valuable quantitative information through tables. Unfortunately, extracting this information is difficult due to high variability in the table structure as well as content. We propose \Statements/, a novel domain agnostic data-structure for extracting quantitative facts and related information. We propose translating tables to \statements/ as a new supervised deep-learning universal information extraction task. We introduce SemTabNet -- a dataset of over 100K annotated tables. Investigating a family of T5-based Statement Extraction Models, our best model generates \statements/ which are 82\% similar to the ground-truth (compared to baseline of $21\%$). We demonstrate the advantages of \statements/ by applying our model to over 2700 tables from ESG reports. The homogeneous nature of \statements/ permits exploratory data analysis on expansive information found in large collections of ESG reports.
\end{abstract}

\section{Introduction}
\label{sec:introduction}
It is invaluable to assess mankind's impact on climate. Climate change related information is often published in so-called  ``Environment, Social, and Governance (ESG)'' reports. Corporations report valuable quantitative data regarding their efforts to improve their impact on environment, working conditions, and company culture in these ESG reports \citep{bingler_how_2022, schimanski_bridging_2024}. 

Like most technical documents, ESG reports present their key information in tables, making table understanding and information extraction (IE) an important problem \citep{mishra_esg_2024}. This problem becomes further complicated due to the large variety and diversity of tabular representations used in these reports. Despite efforts to standardize these reports, this diversity makes the task of extracting information from these documents extremely challenging (see Appendix Fig. \ref{fig:complex_table} for an example table).

Large Language Models (LLMs) have turned out to be excellent tools for IE, due to their ability to parse, understand, and reason over textual data \citep{openai_gpt-4_2023, touvron_llama_2023}. This, in combination with their in-context learning ability, makes them excellent for IE from text \citep{brown_language_2020}. This approach breaks down when applying the same techniques on tables \citep{zhu_converting_2021}.

\begin{figure}[]
    \centering
    \includegraphics[width=\linewidth]{images/statement_knowledge_model_vertical.png}
    \caption{The knowledge model of \Statements/ represented as a tree. From the root node, individual statements emerge as branches. Associated with each individual statement node are the leaf predicate nodes.}
    \label{fig:knowledge_model}
\end{figure}

\begin{figure*}
    \centering
    \includegraphics[width=\linewidth]{images/text_table_to_statements.png}
    \caption{A diagram explaining the framework introduced in this paper. We fine-tune LLMs on the task of `Statement Extraction' leading to a family of ``\textbf{S}tatement \textbf{E}xtraction \textbf{M}odels'' (SEM). Quantitative facts are extracted from heterogeneous unstructured data (only tables in this paper) and stored as Statements.}
    \label{fig:text_table_to_statements}
\end{figure*}

In this paper, we present a general approach for universal IE from tables. Universal IE involves named entity recognition and relationship extraction among other tasks. To this end, we propose a new tree-like data structure, called `\Statement/', which can combine multiple (named) entities and (n-ary) relations (Fig. \ref{fig:knowledge_model}). It allows us to represent information in a homogeneous domain agnostic fashion. 
A \statement/ tree can contain content from different subjects, allowing for a universal IE approach to tables across multiple domains.
With the introduction of \statements/, the IE problem from tables becomes a \textit{translation problem} which we call \emph{`statement extraction'} -- translating the original table into a set of statements. ESG reports, to this day, are manually analyzed by consultancy firms and professional organisations \citep{henisz_five_2019}. With our proposed statement extraction, this process can now be fully automated.

To evaluate our model generated statements, we propose a novel application of the well-established Tree Edit Distance \cite{pawlik_tree_2016}. We propose Tree Similarity Score ($t_s$) for measuring the similarity between two trees. 
As a baseline, we experiment with in-context learning using state-of-the-art LLMs like Mistral \cite{jiang_mistral_2023}, Mixtral \cite{Jiang2024MixtralOE}, Llama2 \cite{touvron_llama_2023}, and Falcon \cite{almazrouei2023falcon}. These models show an average $t_s$ varying from $0\%$ to $21\%$. On the other hand, our best-performing fine-tuned T5 based model shows a $t_s$ of $82\%$. Our main contributions are:
\begin{itemize}
\setlength\itemsep{-0.5 em}
\item We introduce a new knowledge model called \Statement/ for mapping complex, irregular, and heterogeneous information to a uniform domain agnostic structure.
\item We present a new supervised deep learning universal IE task called \emph{`statement extraction'}. The fine-tuned models show significant improvement over baseline experiments providing competitive benchmarks for the community.   
\item We contribute to the field of table understanding, by providing ``SemTabNet'' a dataset containing over 100K annotated ESG tables. All cells in these tables are annotated to reflect their semantic relationship with other cells.
\item We propose Tree Similarity Score, which in a single number quantifies the quality of entities and relationships extraction in the statement.
\end{itemize}

We begin, in Sect. \ref{sec:related_work}, by discussing related works. In Sect. \ref{sec:statements} we explain the concept of `\Statements/' and
present the SemTabNet dataset in Sect. \ref{sec:data}. In Sect. \ref{sec:experimentalresults}, we discuss the various experiments we performed and their results. We end the paper with an application of our model on ESG reports. 


\section{Related works}
\label{sec:related_work}

\citet{fang_large_2024} group the applications of deep learning methods to tables or tabular data into four broad categories. (1) Tree based methods such as gradient-boosted decision trees \citep{borisov_deep_2022} for predictions on tabular data. (2) Attention-based methods which include developing models that learn tabular representations such as TAPAS \cite{herzig-etal-2020-tapas}, TABERT \cite{yin-etal-2020-tabert}, 
% TABBIE \cite{iida-etal-2021-tabbie}, TableFormer \cite{yang-etal-2022-tableformer}, 
and/or fine-tuning models for downstream tasks on tabular data like fact-checking  \cite[TABFACt]{2019TabFact}, question-answering \cite{liu_tapex_2021, mishra_esg_2024}, semantic parsing \cite{yu_grappa_2020}.
% , etc (see \cite{badaro_transformers_2023} for a comprehensive survey). 
(3) Regularization methods which attempt to modify model sensitivity to tabular features \cite{kadra_well-tuned_2021}. (4) Data transformation methods which aim at converting heterogeneous tabular inputs to homogeneous data, like an image \citep{sun_supertml_2019} or feature engineering \citep{Liu2020DNN2LRIF}. 

Another class of problem which is similar to the data transformation approach is (generative) information extraction (IE) which involves adopting LLMs to generate structural information from an information source.
Recent studies have found that LLMs can also perform universal IE \cite{kardas_axcell_2020, paolini_structured_2020, wang_deepstruct_2022, wang_instructuie_2023}. 

In a universal IE task, a model is trained to generate desirable structured information $y$, given a pre-defined schema $s$, and information source $x$ \cite{lu_unified_2022}. Using pre-trained language models, \citet{wang_ielm_2022} perform IE in two steps: argument extraction and predicate extraction. Based on this, they introduced a text-based open IE benchmark. \citet{wang_zero-shot_2021} presented DeepEx for extracting structured triplets from text based data. \citet{wang_deepstruct_2022} demonstrate that pre-training models on task-agnostic corpus lead to performance improvement on tasks like IE, entity recognition, etc. However, these approaches are limited to textual data.

\citet{bai_schema-driven_2024} have shown that LLMs can perform IE on tabular data when prompted with a table and a relevant extraction schema. Their approach is based on a human-in-the-loop in-context learning.  A domain-expert is necessary for producing robust extraction schema, which instructs the model to generate structured records from a table. This strongly limits the adaptability of their approach to different domains. Although limited to text, \cite{lu_unified_2022} also propose a schema-driven universal IE system. They use a structure extraction language which generates structural schema prompt which guides the model in its IE tasks. 

As we show, the \statements/ data structure removes several limitations of previous universal IE approaches and is applicable to `wild' heterogeneous information sources.

\section{Definition of \Statements/}
\label{sec:statements}

The \statements/ data structure aims to homogenize data coming from complex, irregular, heterogeneous information source (text or tables). At its core, the \statements/ data structure is a tree structure (\cref{fig:knowledge_model}). From the root of the tree, we have `subject'-nodes, which contain information regarding the `subject' and the `subject-value' keys. From each subject-node, there are one or more predicate nodes, which define the `property', `property-value', and `unit' keys. Each predicate node carries an atomic piece of quantitative information.

The \statement/ knowledge model can be applied to both text and tables. In Fig. \ref{fig:text_table_to_statements}, we show the same \statements/ structure which could be obtained from a text or a corresponding table. As such, the \statements/ structure is not bound only to tables, however, it shows its usefulness particularly when normalising information from heterogeneous tables. The details of how we create trees are presented with examples in \cref{appendix:ted}.

The tree structure of \statements/ allows us to quantify, with a single number, the transformation of information from a table. This is accomplished by computing the Tree Similarity Score (based on the Tree Edit Distance (TED)~\citet{pawlik_tree_2016, schwarz_new_2017}) between predicted and ground-truth \statements/. TED is defined as the minimum-cost sequence of node operations that transform one tree into another. 
% Two trees are identical if their TED is 0 and maximally distinct if their normalized TED is 1. 
Like the Levenshtein distances on strings \citep{1966SPhD...10..707L}, TED involves three kinds of operations: node insertions, deletions, and renaming. The cost of each operation can be freely defined, which makes this metric both flexible and powerful. Two trees are exactly the same when their tree similarity score is 100\%. To ensure high quality statement extraction, we set up robust TED costs such that minor differences can lead to poor tree similarity scores. In \cref{appendix:treesimilarityscore}, we demonstrate tree similarity score with some examples.

It is also instructive to look at the edit types which converted the predicted statements into ground-truth statements. For this, we measure the ratio of edit type to the total number of edits. 
We find that the ratio of insertions and ratio of deletions carries the information about the structural similarity of two trees. If the model predicted too few nodes, the ratio of insertions will be high. Correspondingly, if the statements from the model's prediction have too many nodes, the deletion ratio dominates. If two trees are structurally similar, then the ratio of both insertions and deletions is low. In this case, the edits are dominated by renaming.

While tree-based metrics are sensitive to both entity and relationship extraction, we also would like to understand the ability of a Statement Extraction Model (SEM) to extract entities alone\footnote{Here, `entity' refers to the values of attributes in a statement. For example, `scope 1 emissions' is an entity from the statement shown in \cref{fig:text_table_to_statements}.}. For this, we concatenate all the predicate nodes in a statement. We create sets of values corresponding to: subject, subject value, property, property value, unit. We count true positives when an entity is found in both the sets from model prediction and ground truth. False negatives are counted when an entity is present only in the ground truth set and false positives when the entity is present only in the predicted set. Based on these, we measure the standard accuracy, recall and F1 measures.

\section{SemTabNet: \Statements/ Data}
\label{sec:data}

\begin{figure*}
    \centering
    \includegraphics[width=\linewidth]{images/model_input_output.png}
    \caption{Input and output for the task of ``Statement Extraction''. \textit{Top Left:} Page from an ESG report containing tables. \textit{Top Right:} One of the tables, from the same page, prepared as markdown for model input. \textit{Bottom Left:} Model output for the task of indirect statement extraction. \textit{Bottom Right:} Model output for the task of direct statement extraction.}
    \label{fig:model_input_output}
\end{figure*}

There are many large data sets of annotated tables which suffer from two major limitations: (1) they focus on understanding table structure only i.e. demarcating table headers from table content, and (2) contain little diversity in shape, size, and complexity of the table. Tables found in ESG reports are of high complexity with little common underlying pattern. In this work, we advance deep learning on table understanding by annotating the content of the table and annotating complex tables. 

We used the Deep Search toolkit~\footnote{Available via: \href{https://ds4sd.github.io}{https://ds4sd.github.io}.} to collect over 10K ESG reports from over 2000 corporations. Deep Search crawled these PDF reports, converted them into machine readable format, and provided this data along with the metadata of each report in json format. 
% This raw data included 324,037 tables. 

We compiled a list of important keywords which capture many important concepts in ESG reports (see \cref{appendix:esgkeywords}). Next, we select only those tables which have some relevance with the keywords. For this we used the following conditions: the ROUGE-L precision (longest common sub-sequence) score between raw data and keywords must be greater than 0.75 and there must be quantitative information in the table. 


We need a strategy for understanding the content of a table and extracting statements from it. After manually observing hundreds of tables, we decided on a two-step approach to prepare our ground-truth data. First, we classify all the cells in a table based on the semantic meaning of their content into 16 categories which helps us in constructing statements. For each table, this step creates a `labels-table' with the same shape and structure as the original, but the cells of this labels-table only contain category labels (see \cref{fig:model_input_output}). Secondly, we create a program which reads both the labels-table and the original table and extracts statements in a rule-based approach. The algorithm is described in \cref{alg:statement_extraction}.
The 16 labels are:  
\begin{itemize}
    \setlength\itemsep{-0.5em}
    \item Property, Property Value
    \item Sub-property
    \item Subject, Subject Value 
    \item Unit, Unit Value
    \item Time, Time Value
    \item Key, Key Value
    \item Header 1, Header 2, Header 3
    \item Empty, Rubbish
\end{itemize}

During annotation, all cells are mapped to one of the above labels. For cells which contain information pertaining to more than one label, we pick the label which is higher in our ordered list of labels. So ``Revenue (US\$)'', is labelled as \texttt{property}. 
The `property' and `sub-property' cells always have associated `property value' cell(s). The `header' cells never have an associated value and often divide the table into smaller sections. Empty cells are labelled `empty'. When a table contains unnecessary parts due to faulty table recovery or non-quantitative information, we label such cells as `rubbish'. When a property/property value pair carries supplementary information, those cells are annotated as `key'/`key values'.

Additionally, we observed that most tables can be reasonably classified into three baskets: simple, complex, and qualitative. There are simple tables whose structure cannot be further subdivided into any smaller table. There are complex tables whose structure can be further divided into multiple smaller tables. Finally, there are qualitative tables (like table of contents) which contain little valuable information for our endeavour. 

We collected about 2,800 tables and found $\sim 20\%$ had simple layout, $\sim 20\%$ had complex layout (composed of multiple simpler tables arranged hierarchically), and $\sim 60\%$ were qualitative. We discarded all qualitative tables from any further analysis. To ensure that our data is not biased towards either simple or complex tables, we manually annotated all the cells of 569 simple tables and 538 complex tables. In total, we annotated 1,107 tables (84,890 individual cells) giving rise to 42,982 statements. 

Due to the nature of our strategy, one can extract statements from tables either directly in a zero shot manner (direct SE) or by predicting cell labels and then using the rule-based approach to construct statements (indirect SE) (see Fig. \ref{fig:model_input_output}). We have experimented with both approaches. 

We further augmented the annotated tables to create a large training data. We shuffle the rows and columns of tables corresponding to property-values to create new augmented tables, while keeping their contents the same. While this is straightforward for simple tables, special care was taken for complex tables such that only rows/columns which belonged together within a category were shuffled. The maximum number of augmented tables emerging from the shuffling operations was limited to 130, leading to over 120K tables. To promote further research and development, we open source this large dataset of semantic cell annotations as SemTabNet\footnote{Links for code and data, respectively:\\ \href{https://github.com/DS4SD/SemTabNet}{https://github.com/DS4SD/SemTabNet} \href{https://huggingface.co/datasets/ds4sd/SemTabNet}{https://huggingface.co/datasets/ds4sd/SemTabNet}}. Table \ref{tab:table_counts} shows the data counts in SemTabNet. 
\input{table_counts}

\section{Experiments \& Results}
\label{sec:experimentalresults}

Fig.~\ref{fig:model_input_output} presents Statement Extraction as a supervised deep learning task. Due to the nature of how tables are annotated (see \cref{sec:data}), it is possible to train models for statement extraction both directly and indirectly. We consider the following three seq2seq experiments: (1) \textit{SE Direct}: the model is presented with an input table as markdown in a prompt. The model generates the tabular representation of the resulting statements as markdown. (2) \textit{SE Indirect 1D}: In this experiment, the model input is the individual table cell contents. For a table with $n$ cells, we predict $n$ labels sequentially (hence, 1D) and then use this information to construct statements. Individual cell labels predicted by the model are stitched together to form the labels table, which is then used to construct the predicted statement by using our rule-based algorithm. (3) \textit{SE Indirect 2D}: As opposed to SE Indirect 1D, in this experiment, we predict the cell labels of all cells in a table simultaneously. The entire table, as markdown, is input to the model (hence 2D) and the model generates the labels table, as markdown. Using the rule-based algorithm, the predicted labels table is converted into predicted statements.

We use six special tokens, which allow us to control and parse model output.
\begin{itemize}
    \setlength\itemsep{-0.5em}
    \item Input table start token: \texttt{<table>}
    \item Input table stop token: \texttt{</table>}
    \item Output start token: \texttt{<response>}
    \item Output stop token: \texttt{</response>}
    \item Newline token: \texttt{<br>}
    \item Separate list item token: \texttt{<sep>}
\end{itemize}
This allows us to parse the predicted statements from an LLM. Once successfully parsed, the output statements can be trivially converted from one representation to another. This is crucial because we compare model predicted statements with ground truth by converting statements into a tree structure. These tokens are added to the tokenizer vocabulary before fine-tuning any model.


\input{table_result}

Since the nature of these tasks naturally fits the paradigm of sequence-to-sequence models, we fine-tune T5 models \cite{raffel_exploring_2020}. T5 models are encoder-decoder transformer architecture models which are suitable for many sequence-to-sequence tasks. In our experiments, we  train T5 variants (Small, Base, Large, and 3B) to create a family of Statement Extraction Models (SEM).

In our training data for tables, the input token count is less than 512 for $50\%$ of the data, and it is less than 1024 for $90\%$ of the data. Thus, except where mentioned, we train T5 models (small, base, large) with context windows of 512 and 1024, and T5-3b with context window of 512. All models are fine-tuned in a distributed data parallel (DDP) manner simultaneously across 4 GPU devices (Nvidia A100-40GB for T5-Small, T5-Base, T5-Large and NVIDIA A100-80GB for T5-3B). Additionally, the largest possible batch size was used for all models. The batch size is impacted by factors like model size, GPU memory, and context window. In turn it affects the number of epochs we can fine-tune in a reasonable time. 

For all tasks, we stop the fine-tuning process either after 500,000 steps or after 7 days. We use the AdamW optimizer with $\beta_1 = 0.9$ and $\beta_2 = 0.999$. All models are trained with a maximum learning rate of $5 \times 10^{-4}$. There is a warm-up phase of 1000 steps in which the learning rate increases linearly from $10^{-10}$ to $5 \times 10^{-4}$. After another 1000 steps, the learning rate is exponentially decayed until it reaches its lowest value of $10^{-6}$, where it remains until the end of the training. 

Table \ref{tab:table_result} presents the key results of our experiments. For each table, we evaluate the statements predicted by the model (directly or indirectly) against the ground truth statements. For each task and each model therein, we present the averaged tree similarity score ($t_s$) (measuring entity \& relationship extraction) and the averaged F1 score (measuring entity extraction). Also present are the averaged ratios of tree edit types, which help us understand $t_s$. For all reported values, assuming a normal distribution, the standard error of the mean is below $5 \times 10^{-5}$ and the 99\% confidence interval for all values is about $0.1\%$.

\textbf{Baseline Experiments}:
For baseline experiments, several state of the art LLMs were tested for their in-context learning ability. In the prompt, we show the model an example of direct statement extraction (1-shot), followed by a test table. 
 
The models produce statements in markdown format, which are evaluated against ground truth statements. The average tree similarity score across 1100 annotated tables varies from $0\%$ for Falcon40b to $20\%$ for Mixtral (8$\times$7b models). For entity extraction, Llama2-13b performed the best with an average F1 score of 38. Not all outputs generated by the model were in correct markdown format. 
Minor changes in the prompt were found to create vast differences in the quality of extracted statements. In \cref{appendix:prompt}, we show examples of the prompt and the model output for some cases.

\textbf{Statement Extraction Indirect 1D}:
All models trained on this task have context window of 512. Their performance tends to scale with model size. 
These models can learn to extract entities, but relationship extraction is difficult. 
For SEM-T5-small, the ratio of insertion is $\approx 98\%$ which means that the predicted statements do not have enough nodes. 

\textbf{Statement Extraction Indirect 2D}:
All models trained on this task perform well on entity extraction with average F1 scores of over $95\%$. 
The highest performing model is the SEM-T5-3b (512) with an average tree similarity score of $81.76 \%$. 

\textbf{Statement Extraction Direct}: Based on tree similarity score, most models show poor performance in direct SE. The best performing model is SEM-T5-base with a context window of 1024. It gets an average F1 score of 76.99\% and an average tree similarity score of only 11\%. To understand why these models perform so poorly on direct SE, we look at the ratio of tree edits. 

We note that the ratio of deletions for all models in this task is close to 0. On the other hand, the ratio of insertions for all models is high (from 88\% to 98\%). This suggests that the statement trees produced by these models are missing a vast number of nodes compared to the ground truth. In fact, perusing the model output shows that while the output is of high quality, it contains significantly fewer nodes than ground truth statements.

\begin{figure*}[ht]
    \centering
    % \includegraphics[width=\linewidth]{images/counts_keywords_esg_2022.png}
    \includegraphics[width=0.32\linewidth]{images/counts_env.png}
    \includegraphics[width=0.32\linewidth]{images/counts_social.png}
    \includegraphics[width=0.32\linewidth]{images/counts_gov.png}
    \\    
    % \includegraphics[width=\linewidth]{images/property_frequency.png}
    \includegraphics[width=0.4\linewidth]{images/box-scope1.png} \hfill
    \includegraphics[width=0.4\linewidth]{images/box-scope2.png}
    \caption{Exploratory data analysis of \statements/ from over 2700 Tables published in ESG reports in 2022. \textit{Top:} We searched about 50,000 predicates using keywords (shown on the x-axis) related to environment (left), social (middle), and governance (right). The plot shows the distribution of predicates and the number of organizations from this search. \textit{Bottom:} Box plot for extracted Scope 1 and Scope 2 emission values grouped by business sectors from over 300 companies across multiple years. Only sectors with more than 20 data points are included.}
    \label{fig:esg}
\end{figure*}

\textbf{Discussion}: SE Indirect 1D shows good performance on entity extraction, but performs poorly on the combined entity and relationship extraction measured by $t_s$. In this task, the model only sees the content of one cell at a time which makes it easy to extract entities. However, this does not allow the model to develop a strong capability to learn tabular relationships. On the other hand, SE Direct gives poor performance on both entity extraction and relationship extraction. Direct SE expects the models to unravel a dense table into \statements/, for which they must produce many output tokens. For example, the average number of output tokens in the test data for SE direct is $5773 \pm 51$, which is significantly larger than the number of tokens for SE indirect 2D ($346 \pm 1$). Thus, direct SE is a very challenging task and might require different strategies to be executed successfully. 

SE Indirect 2D avoids the disadvantages of both the tasks. In this case, the model sees the entire input table (has the chance to learn tabular relationships) and is only tasked with producing a labels table (can finish generation in a reasonable number of tokens). Our experiments clearly demonstrate that statement extraction via the Indirect 2D approach gives better results. This is an unexpected finding of our study, and we hope it motivates other researchers to improve zero-shot statement extraction capability.

\section{Application to ESG results}
\label{sec:application}


Due to their homogeneous structure, statements enable large-scale exploratory data analysis and data science. To demonstrate the advantage of statements over traditional tabular data science, we applied SEM-T5-large (512 SE Indirect 2D) to over 2700 tables published in over 1000 ESG reports in 2022. This led to 14,766 statements containing over 100k predicates. This dataset containing ESG related KPIs is invaluable to researchers, policy-makers, and analysts. 

We filter this large dataset to contain only those predicates with quantitative property values. This subset contains 47\,901 predicates from 601 corporate ESG reports. We search the properties in this dataset for some keywords representative of ESG KPIs. Fig. \ref{fig:esg} (top) shows the distribution of the number of predicates and the number of distinct organizations which matched our simple keyword search. For example, using `emission' as a keyword, we obtain over 4000 hits with results coming from over 300 distinct corporations. 

Fig.~\ref{fig:esg} (bottom) shows the total scope 1 emissions (left) and total scope 2 emissions (right). Each box shows the distribution of emissions from multiple corporations across sectors ($\sim$20 in Healthcare to $\sim$100 in Technology and Industrial Goods) containing data from several years. The data reported in the original reports contained emissions in different units, which were harmonized for creating this plot.

Since we only took a small subset of 1000 reports for this analysis, our data is incomplete and is only representative. The \statements/ dataset allows one to study how emissions from individual companies or across sectors have evolved over time. This dataset can also serve as a starting point for many other downstream applications like question-answering, fact checking, table retrieval, etc. 


\section{Conclusion \& Future Work}
\label{sec:conclusions}

We have presented a novel approach to map complex, irregular, and heterogeneous information to a uniform structure, \Statements/. We presented Statement Extraction, which is a new supervised deep-learning information extraction task. We contribute to the field of table understanding by open-sourcing SemTabNet, consisting of 100K ESG tables wherein all cells are annotated.

Investigating three variations of the statement extraction task, we found that using a model to generate table annotations and then construct \statements/ produces the best results. This approach has the advantage that it produces homogeneous structured data with reduced hallucinations. \Statements/ are an advantageous vehicle for quantitative factual information. They enable downstream tasks like data science over a large collection of documents. We extracted over 100K facts (predicates) from only 1000 ESG reports.

This work can be easily extended to include domains other than ESG. It can also be extended towards multi-modality by including text data. We leave the use of statements in downstream tasks like QA or document summarization for future exploration.


\section*{Limitations}
\label{sec:limitations}
% \textbf{Authors are required to discuss the limitations of their work in a dedicated section titled “Limitations”. This section should be included at the end of the paper, before the references, and it will not count toward the page limit. This includes both, long and short papers. Note, prior to the December 2023 cycle, this was optional.}

% %- discuss the selection of ESG domain from climate change
Although the ideas and the techniques we describe in this paper are domain agnostic, we limit the scope of this paper to the domain of corporate Environment, Social, and Governance (ESG) reports. This choice is motivated by two observations. First, corporations report valuable quantitative data regarding their efforts to improve their carbon emissions, working conditions, and company culture in ESG reports. These reports contain valuable information regarding the environmental impact of businesses, and the urgency of climate change motivates us to target this domain. Secondly, there is a large variety and diversity of tabular representations used in these reports. Despite efforts to standardize these reports, this diversity makes the task of extracting information from these documents extremely challenging, motivating our choice.

% We apply our model to a large corpus of ESG reports and analyze the statements. In Sect. \cref{sec:discussion}, we present an overall discussion of our work and the analysis of the ESG Statements.
The scope of this work is limited to declarative, explicit knowledge only. All other kinds of knowledge such as cultural, implicit, conceptual, tacit, procedural, conditional, etc.\ are ignored. We focus on information which one colloquially refers to as `hard facts'. Additionally, we limit the scope of this work to quantitative statements, i.e., statements whose property values are numerical quantities. In practice, this restriction means that we exclude qualitative statements, i.e., statements which are not quantitative.

Our model training strategy was biased against large models. We trained all models for either 500K steps or 7 days using the largest possible batch size. This means smaller models learn more frequently (more epochs) than larger models. However, we do not believe this severely impacted the outcome of our experiments. Our resources were enough to recover well-known trends: improved model performance with model size and context-length.

\bibliography{main}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix

\input{esg_keywords}
\onecolumn
\input{statement_examples}
\input{ted}
\input{prompt}
\input{algorithm}
\input{complex_table}
\end{document}
