%%%%%%%% ICML 2023 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

\documentclass[nohyperref]{article}
%
\usepackage{algorithm}
\usepackage{algpseudocode}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\newcommand{\fix}[1]{\textcolor{black}{#1}}
\usepackage{pgfplots}
\usetikzlibrary{pgfplots.groupplots}
\pgfplotsset{compat=1.3}
\usepackage{tikz}
\usetikzlibrary{patterns}
\usepackage{pgf-pie}
\usepackage{pifont}
\newcommand{\okmark}{{\textbf{\textcolor[rgb]{0.1, 0.5, 0.1}{$\checkmark$}}}}
\newcommand{\ngmark}{{\textbf{\color{red}{\ding{55}}}}}
\definecolor{battleshipgrey}{rgb}{0.3, 0.3, 0.3}
\definecolor{brilliantrose}{rgb}{1.0, 0.33, 0.64}
\definecolor{americanrose}{rgb}{1.0, 0.01, 0.24}
\definecolor{jweigreen}{rgb}{0,0.45,0.24}
\definecolor{bluegray}{rgb}{0.1, 0.1, 0.4}
\definecolor{ao(english)}{rgb}{0.0, 0.5, 0.0}
\definecolor{blanchedalmond}{rgb}{1.0, 0.92, 0.8}
\definecolor{atomictangerine}{rgb}{1.0, 0.6, 0.4}
\definecolor{chocolate(web)}{rgb}{0.82, 0.41, 0.12}
\definecolor{bananayellow}{rgb}{1.0, 0.88, 0.21}
\definecolor{goldenbrown}{rgb}{0.6, 0.4, 0.08}
\definecolor{aliceblue}{rgb}{0.94, 0.97, 1.0}
\definecolor{beige}{rgb}{0.96, 0.96, 0.86}
\definecolor{babyblue}{rgb}{0.54, 0.81, 0.94}
\definecolor{camel}{rgb}{0.76, 0.6, 0.42}
\definecolor{cinnamon}{rgb}{0.82, 0.41, 0.12}
\newcommand{\battleshipgrey}[1]{{\color{battleshipgrey}{#1}}}
\newcommand{\americanrose}[1]{{\color{americanrose}{#1}}}
\newcommand{\jweigreen}[1]{{\color{jweigreen}{#1}}}
\newcommand{\darkgreen}[1]{{\color{ao(english)}{#1}}}
\newcommand{\aliceblue}[1]{{\color{aliceblue}{#1}}}
\newcommand{\beige}[1]{{\color{beige}{#1}}}
\newcommand{\babyblue}[1]{{\color{babyblue}{#1}}}
\newcommand{\camel}[1]{{\color{camel}{#1}}}
\newcommand{\cinnamon}[1]{{\color{cinnamon}{#1}}}
\newcommand\tikzmark[2]{%
\tikz[remember picture,baseline] \node[above, outer sep=0pt, inner sep=0pt] (#1){\phantom{#2}};%
}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2022} with \usepackage[nohyperref]{icml2022} above.
\usepackage{hyperref}
% \hyphenpenalty=2000

% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Use the following line for the initial blind version submitted for review:
%\usepackage{icml2023}

% If accepted, instead use the following line for the camera-ready submission:
\usepackage[accepted]{icml2023}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{multirow}
\usepackage{pgfplots}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref} % \usepackage{caption} % \captionsetup{justification=centering} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % THEOREMS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \theoremstyle{plain} \newtheorem{theorem}{Theorem}[section] \newtheorem{proposition}[theorem]{Proposition} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{corollary}[theorem]{Corollary} \theoremstyle{definition} \newtheorem{definition}[theorem]{Definition} \newtheorem{assumption}[theorem]{Assumption} \theoremstyle{remark} \newtheorem{remark}[theorem]{Remark} % Todonotes is useful during development; simply uncomment the next line % and comment out the line below the next line to turn off comments %\usepackage[disable,textsize=tiny]{todonotes} \usepackage[textsize=tiny]{todonotes} % The \icmltitle you define below is probably too long as a header. % Therefore, a short form for the running title is supplied here: \icmltitlerunning{Multimodal Chain-of-Thought Reasoning in Language Models} \begin{document} \twocolumn[ \icmltitle{Multimodal Chain-of-Thought Reasoning in Language Models} % It is OKAY to include author information, even for blind % submissions: the style file will automatically remove it for you % unless you've provided the [accepted] option to the icml2022 % package. % List of affiliations: The first argument should be a (short) % identifier you will use later to specify author affiliations % Academic affiliations should list Department, University, City, Region, Country % Industry affiliations should list Company, City, Region, Country % You can specify symbols, otherwise they are numbered in order. % Ideally, you should not use this facility. Affiliations will be numbered % in order of appearance and this is the preferred way. \icmlsetsymbol{equal}{*} \begin{icmlauthorlist} \icmlauthor{Zhuosheng Zhang}{s} \icmlauthor{Aston Zhang}{a} \icmlauthor{Mu Li}{a} \icmlauthor{Hai Zhao}{s} \icmlauthor{George Karypis}{a} \icmlauthor{Alex Smola}{a} %\icmlauthor{}{sch} %\icmlauthor{}{sch} \end{icmlauthorlist} \icmlaffiliation{s}{Shanghai Jiao Tong University} \icmlaffiliation{a}{Amazon Web Services} \icmlcorrespondingauthor{Zhuosheng Zhang (work done at Amazon Web Services)}{zhangzs@sjtu.edu.cn} \icmlcorrespondingauthor{Aston Zhang}{az@astonzhang.com} % You may provide any keywords that you % find helpful for describing your paper; these are used to populate % the "keywords" metadata in the PDF but will not be shown in the document \icmlkeywords{Machine Learning, ICML} \vskip 0.3in ] % this must go after the closing bracket ] following \twocolumn[ ... % This command actually creates the footnote in the first column % listing the affiliations and the copyright notice. % The command takes one argument, which is text to display at the start of the footnote. % The \icmlEqualContribution command is standard text for equal contribution. % Remove it (just {}) if you do not need this facility. \printAffiliationsAndNotice{} % leave blank if no need to mention equal contribution % \printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text. \begin{abstract} Large language models (LLMs) have shown impressive performance on complex reasoning by leveraging chain-of-thought (CoT) prompting to generate intermediate reasoning chains as the rationale to infer the answer. However, existing CoT studies have focused on the language modality. 
{We propose Multimodal-CoT that incorporates language (text) and vision (images) modalities into a two-stage framework that separates rationale generation and answer inference. In this way, answer inference can leverage higher-quality rationales that are generated from multimodal information.}
With Multimodal-CoT, our model under 1 billion parameters outperforms the previous state-of-the-art LLM (GPT-3.5) by 16 percentage points (75.17\%$\rightarrow$91.68\% accuracy) and even surpasses human performance on the ScienceQA benchmark. Code is publicly available.\footnote{\url{https://github.com/amazon-science/mm-cot}}
\end{abstract}
\vspace{-3.8mm}
\section{Introduction}
\label{intro}

Imagine reading a textbook with no figures or tables. Our ability to acquire knowledge is greatly strengthened by jointly modeling diverse data modalities, such as vision, language, and audio. Recently, large language models (LLMs) \citep{brown2020language, lamda, gopher, palm} have shown impressive performance in complex reasoning by generating intermediate reasoning steps before inferring the answer. This intriguing technique is called chain-of-thought (CoT) reasoning \citep{cot_wei,kojima2022large,zhang2022automatic}.
However, existing studies related to CoT reasoning are largely confined to the language modality \citep{wang2022rationale,zhou2022least,lu2022dynamic,fu2022complexity}, with little consideration of multimodal scenarios. To elicit CoT reasoning in multimodality, we advocate a Multimodal-CoT paradigm. Given the inputs in different modalities, Multimodal-CoT decomposes multi-step problems into intermediate reasoning steps (rationale) and then infers the answer. Since vision and language are the most popular modalities, we focus on those two modalities in this work. An example is shown in Figure \ref{fig_examples}. In general, there are two ways to elicit Multimodal-CoT reasoning: (i) prompting LLMs and (ii) fine-tuning small models.\footnote{In this work, we refer to small models as models with less than 1 billion parameters (hereinafter dubbed 1B-models).}

\begin{figure}[t]
\begin{center}
\includegraphics[width=1\columnwidth]{figures/fig-example.pdf}
\end{center}
\vspace{-3.6mm}
\caption{Example of the multimodal CoT task.}
\vspace{-7mm}
\label{fig_examples}
\end{figure}

The most immediate way to perform Multimodal-CoT is to transform the input of different modalities into one modality and prompt LLMs to perform CoT. {For example}, it is possible to generate a caption for an image with a captioning model and then concatenate the caption with the original language input to be fed into LLMs \citep{lu2022learn}. However, {the captioning process incurs severe information loss; thus, using captions (as opposed to vision features) may suffer from a lack of mutual synergy in the representation space of the different modalities.}

\begin{table*}[htb]
\centering
% \fontsize{9pt}{\baselineskip}\selectfont % font size
\vspace{-3mm}
\renewcommand\tabcolsep{4.2pt} % column space
% \renewcommand\arraystretch{0.75} % line space
\small
\caption{Typical CoT techniques (FT: fine-tuning; KD: knowledge distillation). Segment 1: in-context learning techniques; Segment 2: fine-tuning techniques. To the best of our knowledge, our work is the first to study CoT reasoning in different modalities. Moreover, we focus on 1B-models, without relying on the outputs of LLMs.
\label{tab:cot_methods}
}
\begin{tabular}{lcccccc}
\toprule
\textbf{Models} & \textbf{Multimodal} & \textbf{w/o LLM} & \textbf{Model / Engine} & \textbf{Training} & \textbf{CoT Role} & \textbf{CoT Source} \\
\midrule
Zero-Shot-CoT~\citep{kojima2022large} & \ngmark & \ngmark & GPT-3.5 (175B) & ICL & Reasoning & Template \\
Few-Shot-CoT~\citep{cot_wei} & \ngmark & \ngmark & PaLM (540B) & ICL & Reasoning & Hand-crafted \\
Self-Consistency-CoT~\citep{cot_wei_sc} & \ngmark & \ngmark & Codex (175B) & ICL & Reasoning & Hand-crafted \\
Least-to-Most Prompting~\citep{zhou2022least} & \ngmark & \ngmark & Codex (175B) & ICL & Reasoning & Hand-crafted \\
Retrieval-CoT~\citep{zhang2022automatic} & \ngmark & \ngmark & GPT-3.5 (175B) & ICL & Reasoning & Auto-generated \\
PromptPG-CoT~\citep{lu2022dynamic} & \ngmark & \ngmark & GPT-3.5 (175B) & ICL & Reasoning & Hand-crafted \\
Auto-CoT~\citep{zhang2022automatic} & \ngmark & \ngmark & Codex (175B) & ICL & Reasoning & Auto-generated \\
Complexity-CoT~\citep{fu2022complexity} & \ngmark & \ngmark & GPT-3.5 (175B) & ICL & Reasoning & Hand-crafted \\
Few-Shot-PoT~\citep{chen2022program} & \ngmark & \ngmark & GPT-3.5 (175B) & ICL & Reasoning & Hand-crafted \\
\midrule
UnifiedQA~\citep{lu2022learn} & \ngmark & \okmark & T5 (770M) & FT & Explanation & Crawled \\
Fine-Tuned T5 XXL~\citep{magister2022teaching} & \ngmark & \ngmark & T5 (11B) & KD & Reasoning & LLM-generated \\
Fine-Tune-CoT \citep{ho2022large} & \ngmark & \ngmark & GPT-3 (6.7B) & KD & Reasoning & LLM-generated \\
Multimodal-CoT (our work) & \okmark & \okmark & T5 (770M) & FT & Reasoning & Crawled \\
\bottomrule
\end{tabular}
\vspace{-5mm}
\end{table*}

{To facilitate the interaction between modalities, another potential solution is to fine-tune smaller language models (LMs) by fusing multimodal features \citep{zhang2023universal}. As this approach allows the flexibility of adjusting model architectures to incorporate multimodal features, we study fine-tuning models in this work instead of prompting LLMs. The key challenge is that language models under 100 billion parameters tend to generate hallucinated rationales that mislead answer inference \citep{ho2022large,magister2022teaching,ji2022survey}.}

In this work, {to mitigate the challenge of hallucination, we propose Multimodal-CoT, which incorporates language (text) and vision (images) modalities into a two-stage framework that separates rationale generation and answer inference. In this way, answer inference can leverage higher-quality rationales that are generated from multimodal information.}

Our experiments are conducted on the ScienceQA benchmark \citep{lu2022learn}, which is the latest multimodal reasoning benchmark with annotated reasoning chains. Experimental results show that our method surpasses the previous state-of-the-art GPT-3.5 model by 16 percentage points (75.17\%$\rightarrow$91.68\%) on the benchmark. Our contributions are summarized as follows:
(i) {To the best of our knowledge, this work is the first to study CoT reasoning in different modalities.}
(ii) We propose a two-stage framework that fine-tunes language models to fuse vision and language representations to perform Multimodal-CoT. The model is able to generate informative rationales that facilitate inferring final answers.
(iii) Our method achieves new state-of-the-art performance on the ScienceQA benchmark, outperforming GPT-3.5 by 16 percentage points in accuracy and even surpassing human performance.

\section{Background}
This section reviews recent progress in eliciting CoT reasoning by prompting and fine-tuning language models.

\subsection{CoT Reasoning with LLMs}
Recently, CoT has been widely used to elicit the multi-step reasoning abilities of LLMs \citep{cot_wei}. Concretely, CoT techniques encourage the LLM to generate intermediate reasoning chains for solving a problem. Studies have shown that LLMs can perform CoT reasoning with two major paradigms of techniques: Zero-Shot-CoT \citep{kojima2022large} and Few-Shot-CoT \citep{cot_wei,zhang2022automatic}. For Zero-Shot-CoT, \citet{kojima2022large} showed that LLMs are decent zero-shot reasoners whose CoT reasoning can be invoked by adding a prompt like ``Let's think step by step'' after the test question. For Few-Shot-CoT, a few step-by-step reasoning demonstrations are used as conditions for inference. Each demonstration contains a question and a reasoning chain that leads to the final answer. The demonstrations are commonly obtained by hand-crafting or automatic generation; the corresponding techniques are thus referred to as Manual-CoT \citep{cot_wei} and Auto-CoT \citep{zhang2022automatic}.

With effective demonstrations, Few-Shot-CoT often achieves stronger performance than Zero-Shot-CoT and has attracted more research interest. Therefore, most recent studies have focused on how to improve Few-Shot-CoT. Those studies fall into two major research lines: (i) optimizing the demonstrations; (ii) optimizing the reasoning chains. Table \ref{tab:cot_methods} compares typical CoT techniques.

\paragraph{Optimizing Demonstrations}
The performance of Few-Shot-CoT relies on the quality of demonstrations. As reported in \citet{cot_wei}, using demonstrations written by different annotators results in dramatic accuracy disparity in a symbolic reasoning task. Beyond hand-crafting the demonstrations, recent studies have investigated ways to optimize the demonstration selection process. Notably, \citet{rubin2021learning} retrieved demonstrations that are semantically similar to the test instance.
However, this approach shows degraded performance when the reasoning chains contain mistakes \citep{zhang2022automatic}. To address the limitation, \citet{zhang2022automatic} found that the key is the diversity of demonstration questions and proposed Auto-CoT: (i) partition questions of a given dataset into a few clusters; (ii) sample a representative question from each cluster and generate its reasoning chain using Zero-Shot-CoT with simple heuristics. In addition, reinforcement learning (RL) and complexity-based selection strategies were proposed to obtain effective demonstrations. \citet{fu2022complexity} chose examples with complex reasoning chains (i.e., with more reasoning steps) as the demonstrations. \citet{lu2022dynamic} trained an agent to find optimal in-context examples from a candidate pool that maximize the prediction rewards on given training examples when interacting with GPT-3.5.

\paragraph{Optimizing Reasoning Chains}
A notable way to optimize reasoning chains is problem decomposition. \citet{zhou2022least} proposed least-to-most prompting to decompose complex problems into sub-problems and then solve these sub-problems sequentially. As a result, solving a given sub-problem is facilitated by the answers to previously solved sub-problems. Similarly, \citet{khot2022decomposed} used diverse decomposition structures and designed different prompts to answer each sub-question. In addition to prompting the reasoning chains as natural language texts, \citet{chen2022program} proposed program-of-thoughts (PoT), which modeled the reasoning process as a program and prompted LLMs to derive the answer by executing the generated programs. Another trend is to vote over multiple reasoning paths for a test question. \citet{cot_wei_sc} introduced a self-consistency decoding strategy to sample multiple outputs of LLMs and then take a majority vote over the final answers. \citet{wang2022rationale} and \citet{li2022advance} introduced randomness in the input space to produce more diverse outputs for voting.

\subsection{Eliciting CoT Reasoning by Fine-Tuning Models}
A recent interest is eliciting CoT reasoning by fine-tuning language models. \citet{lu2022learn} fine-tuned the encoder-decoder T5 model on a large-scale dataset with CoT annotations. However, a dramatic performance decline was observed when CoT was used to infer the answer, i.e., when the reasoning chain was generated before the answer (reasoning); consequently, CoT was only used as an explanation after the answer. \citet{magister2022teaching} and \citet{ho2022large} employed knowledge distillation by fine-tuning a student model on the chain-of-thought outputs generated by a larger teacher model. The proposed methods showed performance gains in arithmetic, commonsense, and symbolic reasoning tasks.

{There is a key challenge in training 1B-models to be CoT reasoners.} As observed by \citet{cot_wei}, models under 100 billion parameters tend to produce illogical CoT that leads to wrong answers. In other words, it might be harder for 1B-models to generate effective CoT than to generate the answer directly. The challenge becomes even greater in a multimodal setting, where answering the question also requires understanding the multimodal inputs. In the following sections, we explore the challenge of Multimodal-CoT and investigate how to perform effective multi-step reasoning.
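To make the prompting paradigms reviewed above concrete, we provide a minimal sketch of how Zero-Shot-CoT and Few-Shot-CoT prompts can be assembled. The \texttt{llm} callable is a hypothetical text-completion interface, and the templates are illustrative rather than the exact ones used in the cited works:
\begin{verbatim}
# Sketch of the two CoT prompting paradigms.
# `llm` is a hypothetical completion
# function: prompt (str) -> completion (str).

def zero_shot_cot(llm, question):
    trigger = "Let's think step by step."
    # Step 1: elicit a reasoning chain.
    r = llm(f"Q: {question}\nA: {trigger}")
    # Step 2: extract the final answer,
    # conditioned on the rationale.
    return llm(f"Q: {question}\nA: {trigger}"
               f" {r}\nTherefore, the answer is")

def few_shot_cot(llm, demos, question):
    # Each demo pairs a question with a
    # reasoning chain ending in the answer
    # (hand-crafted or auto-generated).
    ctx = "\n\n".join(f"Q: {q}\nA: {r}"
                      for q, r in demos)
    return llm(f"{ctx}\n\nQ: {question}\nA:")
\end{verbatim}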
\section{Challenge of Multimodal-CoT}\label{sec:prelim}
Existing studies have suggested that the CoT reasoning ability may emerge in language models at a certain scale, e.g., over 100 billion parameters \citep{wei2022emergent}. However, it remains an unresolved challenge to elicit such reasoning abilities in 1B-models, let alone in the multimodal scenario. {This work focuses on 1B-models as they can be fine-tuned and deployed with consumer-grade GPUs (e.g., with 32GB memory).} In this section, we investigate why 1B-models fail at CoT reasoning and study how to design an effective approach to overcome the challenge.

\subsection{Towards the Role of CoT}
To begin with, we fine-tune a text-only baseline for CoT reasoning on the ScienceQA benchmark \citep{lu2022learn}. Following \citet{lu2022learn}, we adopt UnifiedQA$_\texttt{Base}$ \citep{khashabi2020unifiedqa} as the backbone language model.\footnote{{UnifiedQA \citep{khashabi2020unifiedqa} is adopted as it is the best fine-tuning model in \citet{lu2022learn}. Model information and implementation details are presented in Appendix \ref{app-sec:baseline}.}} Our task is modeled as a text generation problem, where the model takes the textual information as the input and generates an output sequence consisting of the rationale and the answer. As shown in the example in Figure \ref{fig_examples}, the model takes the concatenation of tokens of the question text (Q), the context text (C), and multiple options (M) as the input. To study the effect of CoT, we compare the performance of three variants: (i) No-CoT, which predicts the answer directly (QCM$\rightarrow$A); (ii) Reasoning, where answer inference is conditioned on the rationale (QCM$\rightarrow$RA); (iii) Explanation, where the rationale is used to explain the answer inference (QCM$\rightarrow$AR).
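For concreteness, the following minimal sketch shows how an example can be serialized into source/target pairs under the three formats. The field names and textual templates are illustrative rather than the exact ones used in our implementation:
\begin{verbatim}
# Sketch: serializing an example into the
# three one-stage formats. Field names and
# templates are illustrative only.

def build_pair(ex, fmt="QCM->A"):
    src = (f"Question: {ex['question']}\n"
           f"Context: {ex['context']}\n"
           f"Options: {' '.join(ex['options'])}")
    r, a = ex["rationale"], ex["answer"]
    if fmt == "QCM->A":      # No-CoT
        tgt = f"The answer is {a}."
    elif fmt == "QCM->RA":   # Reasoning
        tgt = f"{r} The answer is {a}."
    else:                    # QCM->AR, Explanation
        tgt = f"The answer is {a}. {r}"
    return src, tgt
\end{verbatim}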
\begin{table}[htb]
\centering\small
\vspace{-3.6mm}
\caption{Effects of CoT in {the one-stage setting}.\label{tab:pre_position}}
\setlength{\tabcolsep}{12pt}
{
\begin{tabular}{llc}\toprule
{Method} & {Format} & {Accuracy} \\\midrule
No-CoT & QCM$\rightarrow$A & 80.40 \\
\midrule
Reasoning & QCM$\rightarrow$RA & 67.86 \\
Explanation & QCM$\rightarrow$AR & 69.77 \\
\bottomrule
\end{tabular}
\vspace{-1.8mm}
}
\end{table}

\begin{figure*}[htb]
\begin{center}
\includegraphics[width=1\textwidth]{figures/fig-pre-case1.pdf}
\end{center}
\vspace{-3mm}
\caption{Example of the two-stage framework without vision features (baseline) and with vision features (ours) for generating rationales and predicting answers. The upper part presents the problem details with a gold rationale, and the lower part shows the outputs of the baseline and our method incorporated with vision features. We observe that the baseline fails to predict the right answer because it is misled by hallucinated rationales. More examples are shown in Appendix \ref{appendix:misleading}.}
\vspace{-3mm}
\label{fig_pre_case1}
\end{figure*}

Surprisingly, we observe a $\downarrow$12.54\% accuracy decrease (80.40\%$\rightarrow$67.86\%) if the model predicts rationales before answers (QCM$\rightarrow$RA). The results imply that the rationales might not necessarily contribute to predicting the right answer. A similar phenomenon was observed in \citet{lu2022learn}, where a plausible reason is that the model exceeds the maximum token limit before producing the required answer or stops generating the prediction early. However, we find that the maximum length of the generated outputs (RA) is always less than 400 tokens, which is below the length limit of language models (i.e., 512 in UnifiedQA$_\texttt{Base}$). Therefore, the question of why the rationales harm answer inference deserves a more in-depth investigation.

\subsection{Misleading by Hallucinated Rationales}\label{sec:misleading}
To dive into how the rationales affect answer prediction, we separate the CoT problem into two stages, \textit{rationale generation} and \textit{answer inference}. We report the RougeL score and accuracy for rationale generation and answer inference, respectively. Table \ref{tab:pre_decoupled} shows the results based on the two-stage framework. Although the two-stage baseline model achieves a 91.76 RougeL score for rationale generation, the answer inference accuracy is only 70.53\%. Compared with the QCM$\rightarrow$A variant (80.40\%) in Table \ref{tab:pre_position}, the result shows that the generated rationale in the two-stage framework does not improve answer accuracy.

\begin{table}[htb]
\vspace{-3.6mm}
\centering\small
\caption{{Two-stage} setting of (i) rationale generation (RougeL) and (ii) answer inference (Accuracy). \label{tab:pre_decoupled}}
\begin{tabular}{lcc}\toprule
{Method} & {(i) QCM$\rightarrow$R} & {(ii) QCMR$\rightarrow$A} \\\midrule
Two-Stage Framework & 91.76 & 70.53 \\
\midrule
\quad w/ Captions & 91.85 & 71.12 \\
\quad w/ Vision Features & 96.97 & 84.91 \\
\bottomrule
\end{tabular}
\end{table}

Then, we randomly sample 50 error cases and find that the model tends to generate hallucinated rationales that mislead answer inference.
As shown in the example in Figure \ref{fig_pre_case1}, the model (left part) hallucinates that ``\textit{The south pole of one magnet is closest to the south pole of the other magnet}'' due to the lack of reference to the vision content. We find that such mistakes occur at a ratio of 64\% among the error cases (Figure \ref{fig_bar}(a)).

\begin{figure}[t]
\begin{center}
\includegraphics[width=1\columnwidth]{figures/fig-error_bar.pdf}
\end{center}
\vspace{-3.6mm}
\caption{The ratio of hallucination mistakes (a) and the correction rate with vision features (b).}
\vspace{-3mm}
\label{fig_bar}
\end{figure}

\begin{figure*}[htb]
\begin{center}
\includegraphics[width=1\textwidth]{figures/fig-overview.pdf}
\end{center}
\vspace{-3mm}
\caption{Overview of our Multimodal-CoT framework. Multimodal-CoT consists of two stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output. In the first stage, we feed the model with language and vision inputs to generate rationales. In the second stage, we append the original language input with the rationale generated from the first stage. Then, we feed the updated language input with the original vision input to the model to infer the answer.}
\label{fig_overview}
\vspace{-3mm}
\end{figure*}

\subsection{Multimodality Contributes to Effective Rationales}\label{sec:multimodal}
We speculate that such a phenomenon of hallucination is due to a lack of the necessary vision contexts for performing effective Multimodal-CoT. To inject vision information, a simple way is to transform the paired image into a caption \citep{lu2022learn} and then append the caption to the input of both stages. However, as shown in Table \ref{tab:pre_decoupled}, using captions only yields marginal performance gains ($\uparrow$0.59\%). Then, we explore an advanced technique by incorporating vision features into the language model. Concretely, we feed the paired image to the DETR model \citep{carion2020end} to extract vision features. Then we fuse the vision features with the encoded language representations before feeding them to the decoder (more details will be presented in Section \ref{sec:mm_cot}). Interestingly, with vision features, the RougeL score of rationale generation is boosted to 96.97\% (QCM$\rightarrow$R), which correspondingly contributes to a better answer accuracy of 84.91\% (QCMR$\rightarrow$A).
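To illustrate this vision-feature fusion, we provide a minimal PyTorch sketch below. The attention-then-gate design and all module names are one plausible instantiation for illustration only; the exact architecture is presented in Section \ref{sec:mm_cot}:
\begin{verbatim}
import torch
import torch.nn as nn

class GatedFusion(nn.Module):
    # Sketch: fuse frozen vision features
    # (e.g., patch-level DETR outputs) with
    # the text encoder's hidden states before
    # they reach the decoder. Illustrative
    # design; see Section 4 for the exact one.
    def __init__(self, d_model, d_vision):
        super().__init__()
        self.proj = nn.Linear(d_vision, d_model)
        self.attn = nn.MultiheadAttention(
            d_model, num_heads=1,
            batch_first=True)
        self.gate = nn.Linear(2 * d_model,
                              d_model)

    def forward(self, h_text, v_patches):
        # Align vision features: (B, P, d_model).
        v = self.proj(v_patches)
        # Text tokens attend to image patches.
        v_attn, _ = self.attn(h_text, v, v)
        # A sigmoid gate decides how much visual
        # signal each text token absorbs.
        g = torch.sigmoid(self.gate(
            torch.cat([h_text, v_attn], -1)))
        return (1 - g) * h_text + g * v_attn
\end{verbatim}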
With those effective rationales, the phenomenon of hallucination is mitigated: 62.5\% of the hallucination mistakes in Section \ref{sec:misleading} have been corrected (Figure \ref{fig_bar}(b)); an example is shown in Figure \ref{fig_pre_case1} (right part).\footnote{The remaining mistakes are mainly about map understanding, which requires more advanced vision features. We will discuss them in Section \ref{sec:case_studies}.} The analysis so far compellingly shows that vision features are indeed beneficial for generating effective rationales and contributing to accurate answer inference. {As the two-stage method (QCMR$\rightarrow$A) in Table \ref{tab:pre_decoupled} achieves better performance than all the {one-stage} methods in Table \ref{tab:pre_position}, we choose the two-stage method in our Multimodal-CoT framework.}

\section{Multimodal-CoT}\label{sec:mm_cot}
Based on the observations and discussions in Section \ref{sec:prelim}, {we propose Multimodal-CoT to incorporate language (text) and vision (images) modalities into a two-stage framework.} In this section, we first give an overview of the procedure of the framework and then elaborate on the technical design of the model architecture.

\subsection{Framework Overview}
Multimodal-CoT consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input $X$ and output $Y$. The overall procedure is illustrated in Figure~\ref{fig_overview}. We take vision-language as an example to show how Multimodal-CoT works.

In the rationale generation stage, we feed the model with $X=\{X_{\textrm{language}}^{1}, X_{\textrm{vision}}\}$, where $X_{\textrm{language}}^{1}$ represents the language input in the first stage and $X_{\textrm{vision}}$ represents the vision input, i.e., the image. For example, $X$ can be instantiated as a concatenation of the question, context, and options of a multiple-choice reasoning problem \citep{lu2022learn}, as shown in Figure \ref{fig_overview}. The goal is to learn a rationale generation model ${R} = F(X)$, where ${R}$ is the rationale.

In the answer inference stage, the rationale $R$ is appended to the original language input $X_{\textrm{language}}^{1}$ to construct the language input in the second stage, $X_{\textrm{language}}^{2} = X_{\textrm{language}}^{1} \circ {R}$, where $\circ$ denotes concatenation. Then, we feed the updated input $X'=\{X_{\textrm{language}}^{2}, X_{\textrm{vision}}\}$ to the answer inference model to infer the final answer $A = F(X')$.

{In both stages, we train two models with the same architecture independently.} They take the annotated elements (e.g., $X\rightarrow R$ and $XR\rightarrow A$, respectively) from the training set for supervised learning. During inference, given $X$, the rationales for the test sets are generated using the model trained in the first stage; they are used in the second stage for answer inference.
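For illustration, the inference procedure of the two-stage framework can be sketched as follows, where \texttt{stage1\_model}, \texttt{stage2\_model}, and their \texttt{generate} helper are hypothetical placeholders for the two independently trained models:
\begin{verbatim}
# Sketch of two-stage Multimodal-CoT
# inference. The model objects and their
# `generate` helper are hypothetical.

def multimodal_cot(stage1_model, stage2_model,
                   x_language, x_vision):
    # Stage 1: rationale generation,
    # R = F({X_language^1, X_vision}).
    r = stage1_model.generate(x_language,
                              x_vision)
    # Stage 2: append R to the language
    # input and infer the answer,
    # A = F({X_language^1 o R, X_vision}).
    x_language_2 = x_language + " " + r
    return stage2_model.generate(x_language_2,
                                 x_vision)
\end{verbatim}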
\subsection{Model Architecture}
Given the language input $X_{\textrm{language}} \in \{X_{\textrm{language}}^{1}, X_{\textrm{language}}^{2}\}$ and the vision input $X_{\textrm{vision}}$, we compute the probability of generating target text $Y$ (either the rationale or the answer in Figure~\ref{fig_overview}) of length $N$ by
\begin{equation}
\label{eq:gated}
p(Y|X_{\textrm{language}},X_{\textrm{vision}}) = \prod_{i=1}^{N} p_{\theta}\left(Y_{i} \mid X_{\textrm{language}}, X_{\textrm{vision}}, Y_{<i}\right),
\end{equation}
where $Y_{<i}$ denotes the target tokens generated before position $i$.
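In training, this likelihood is maximized with the standard token-level cross-entropy loss under teacher forcing. A minimal sketch, assuming decoder \texttt{logits} of shape (batch, length, vocab) and a hypothetical padding index \texttt{pad\_id}:
\begin{verbatim}
import torch.nn.functional as F

def sequence_nll(logits, targets, pad_id=0):
    # Negative log-likelihood of the target
    # tokens Y_1..Y_N implied by the equation
    # above, with logits produced by the
    # decoder conditioned on the fused
    # language/vision input and Y_{<i}.
    return F.cross_entropy(
        logits.transpose(1, 2),  # (B, V, N)
        targets,                 # (B, N)
        ignore_index=pad_id)
\end{verbatim}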