<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="description" content="World-model-augmented (WMA) web agent that simulates action outcomes for better decision-making in long-horizon web tasks.">
  <meta name="keywords" content="Web Agents, World Models, LLMs, Web Navigation, Autonomous Agents">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Web Agents with World Models: Learning and Leveraging Environment Dynamics in Web Navigation</title>
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.svg">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
  <script type="text/javascript" async 
    src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
  <script type="text/javascript" async 
    src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.0/es5/tex-mml-chtml.js"></script>
  <script>
    function showStep(stepNumber) {
      // Hide all step contents
      var stepContents = document.querySelectorAll('.step-content');
      for (var i = 0; i < stepContents.length; i++) {
        stepContents[i].style.display = 'none';
      }
      
      // Remove active class from all tabs
      var tabs = document.querySelectorAll('.tabs li');
      for (var i = 0; i < tabs.length; i++) {
        tabs[i].classList.remove('is-active');
      }
      
      // Show the selected step content and activate its tab
      document.getElementById('step' + stepNumber + '-content').style.display = 'block';
      document.getElementById('step' + stepNumber + '-tab').classList.add('is-active');
    }
    
    // Initialize when DOM is fully loaded
    document.addEventListener('DOMContentLoaded', function() {
      showStep(1);
    });
  </script>
</head>
<body>

<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">Web Agents with World Models: Learning and Leveraging Environment Dynamics in Web Navigation</h1>
          <div class="publication-links">
            <div class="is-size-5 publication-authors">
              <span class="author-block"><a href="#">Hyungjoo Chae</a>,</span>
              <span class="author-block"><a href="#">Namyoung Kim</a>,</span>
              <span class="author-block"><a href="#">Kai Tzu-iunn Ong</a>,</span>
              <span class="author-block"><a href="#">Minju Gwak</a>,</span>
              <span class="author-block"><a href="#">Gwanwoo Song</a>,</span>
              <span class="author-block"><a href="#">Jihoon Kim</a>,</span>
              <span class="author-block"><a href="#">Sunghwan Kim</a>,</span>
              <span class="author-block"><a href="#">Dongha Lee</a>,</span>
              <span class="author-block"><a href="#">Jinyoung Yeo</a></span>
            </div>
            <div class="is-size-5 publication-affiliations">
              <span class="affiliation-block">Yonsei University</span>
            </div>
          </div>
          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- Paper Link -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2410.13232" target="_blank" class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>
              <!-- Code Link -->
              <span class="link-block">
                <a href="https://github.com/kyle8581/WMA-Agents" target="_blank" class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Demo Link -->
              <span class="link-block">
                <a href="https://huggingface.co/spaces/hyungjoochae/WMA-Agent-Demo" target="_blank" class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-play"></i>
                  </span>
                  <span>Demo</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- 
<section class="hero is-light">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column">
          <div class="content has-text-centered">
            <img src="static/images/figure1_wma_overview.png" alt="WMA Overview">
          </div>
        </div>
      </div>
    </div>
  </div>
</section> -->
<section class="section" style="padding-top: 0px;">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <div class="box has-background-white" style="box-shadow: 0 0.5em 1em -0.125em rgba(10, 10, 10, 0.3), 0 0 0 1px rgba(10, 10, 10, 0.05);">
          <h2 class="title is-2">Overview</h2>
          <br>
          <div class="content has-text-centered">
            <img src="static/images/figure1_wma_overview.png" alt="WMA Web Agent">
          </div>
          <p>Large language models (LLMs) have recently gained much attention in building autonomous agents. However, the performance of current LLM-based web agents in long-horizon tasks is far from optimal, often yielding errors such as repeatedly buying a non-refundable flight ticket. By contrast, humans can avoid such irreversible mistakes, as we have an awareness of the potential outcomes (e.g., losing money) of our actions, also known as a <strong>"world model"</strong>. Motivated by this, our study begins with preliminary analyses confirming the absence of world models in current LLMs (e.g., GPT-4, Claude-3.5-Sonnet, etc.). We then present a <strong>World-Model-Augmented (WMA) web agent</strong>, which simulates the outcomes of its actions for better decision-making. To overcome the challenges of training LLMs as world models to predict next observations, such as repeated elements across observations and long HTML inputs, we propose a transition-focused observation abstraction, where the prediction targets are free-form natural language descriptions that exclusively highlight important state differences between time steps. Experiments on <a href="https://github.com/web-arena-x/webarena">WebArena</a> and <a href="https://github.com/OSU-NLP-Group/Mind2Web">Mind2Web</a> show that our world models improve agents' policy selection without training, and demonstrate our agents' cost- and time-efficiency compared to recent tree-search-based agents.
          </p>
          <br>
          <div class="content">
            <h3 class="title is-4">🌍 News</h3>
            <ul>
              <li><strong>[2025/01/22] WMA Web Agent is accepted to ICLR 2025!</strong></li>
              <li><strong>[2024/06/12] WMA Web Agent is out!</strong></li>
            </ul>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-2">Methodology</h2>
        <div class="tabs">
          <ul>
            <li id="step1-tab" class="is-active"><a href="#" onclick="showStep(1); return false;">Phase I: World Model Training</a></li>
            <li id="step2-tab"><a href="#" onclick="showStep(2); return false;">Phase II: Inference-Time Policy Optimization with the World Model</a></li>
          </ul>
        </div>
        <div id="step1-content" class="content step-content">
          <div class="content has-text-centered">
            <img src="static/images/method_1.png" alt="World Model Training Methodology">
          </div>
          <h3 class="title is-4">Step I: Harvesting Agent-Environment Interaction Data</h3>
          <p>
            We start by collecting the dataset
            \( \mathcal{D} = \sum^{n}_{t=1} \{ I, o_t, a_t, o_{t+1} \} \)
            from the environment \( \mathcal{E} \) for training world models.
            For that, we prompt an LLM as a web agent to achieve the goal provided in the user instruction \( I \),
            by iteratively predicting an action \( a_t \) based on the current observation \( o_t \)
            throughout all \( n \) time steps.
            Consequently, we obtain \( \mathcal{D} \) from the trajectory
            \( \tau = \{o_1, a_1, o_2, ..., a_{n}, o_{n+1}\} \) based on \( I \),
            with the environment states \( \{s_1, ..., s_{n+1}\} \subset \mathcal{S} \)
            obtained via the transition function \( \mathcal{T} \).
          </p>
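          <p>
            As a concrete illustration, the rollout below sketches this data-harvesting loop. It is a minimal sketch: <code>env</code> and <code>policy_llm</code> are hypothetical stand-ins for a WebArena-style environment and a prompted LLM agent, not the exact interfaces of our implementation.
          </p>
          <pre><code># Minimal sketch of Step I: rolling out an LLM policy to harvest
# (I, o_t, a_t, o_{t+1}) tuples. `env` and `policy_llm` are hypothetical.

def collect_trajectory(env, policy_llm, instruction, max_steps=20):
    """Roll out the agent once and return its transition tuples."""
    dataset = []
    obs = env.reset(instruction)                       # o_1 (e.g., accessibility tree)
    for _ in range(max_steps):
        action = policy_llm.predict(instruction, obs)  # a_t from the LLM policy
        next_obs, done = env.step(action)              # o_{t+1} via transition function T
        dataset.append({"I": instruction, "o_t": obs,
                        "a_t": action, "o_t+1": next_obs})
        if done:
            break
        obs = next_obs                                 # advance to the next time step
    return dataset</code></pre>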
          <h3 class="title is-4">Step II: Transition-Focused Observation Abstraction</h3>
          <p>
            With the collected data
            \( \mathcal{D} = \sum^{n}_{t=1} \{ I, o_t, a_t, o_{t+1} \} \),
            it is intuitive to train LLM-based world models to directly predict \( o_{t+1} \),
            which is expressed as text (e.g., HTML or an accessibility tree).
            However, raw next observations are long and largely repeat elements of the previous observation,
            making them poor prediction targets; we therefore abstract each observation to focus on what actually changed.
          </p>
          <div class="has-text-centered">
            <figure class="image" style="width: 80%; margin: 0 auto;">
              <img src="static/images/figure_5.png" alt="Figure 5: Transition-Focused Observation Abstraction">
            </figure>
          </div>
          <p>As shown in Figure 5, we first (i) apply the Hungarian algorithm
            to calculate a cost matrix for matching elements between
            \( o_t \) and \( o_{t+1} \) and (ii) mechanically transform the results into a list of state transitions
            \( \Delta(o_t, o_{t+1}) \), pointing out <code>UPDATED</code>, <code>DELETED</code>, and <code>ADDED</code> elements on the web page.
            After that, we prompt an LLM to convert the extracted \( \Delta(o_t, o_{t+1}) \) into a free-form natural language
            description \( \tilde{o}_{t+1} \), which highlights the difference between the new observation \( o_{t+1} \) and \( o_t \).
            Replacing \( o_{t+1} \) in the \( \mathcal{D} \) collected in Step I with the \( \tilde{o}_{t+1} \) acquired here,
            we obtain the final dataset
            \( \tilde{\mathcal{D}} = \sum^{n}_{t=1} \{ I, o_t, a_t, \tilde{o}_{t+1} \} \)
            for training world models.</p>
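          <p>
            The following sketch illustrates the matching step with SciPy's Hungarian solver (<code>linear_sum_assignment</code>). The element lists and the string-similarity cost are illustrative assumptions; the exact cost matrix used in the paper may differ.
          </p>
          <pre><code># Minimal sketch of Step II: match elements of o_t and o_{t+1} with the
# Hungarian algorithm, then emit Delta(o_t, o_{t+1}). The similarity-based
# cost below is an assumption for illustration.
from difflib import SequenceMatcher
import numpy as np
from scipy.optimize import linear_sum_assignment

def extract_delta(prev_elems, next_elems, threshold=0.95):
    """Return Delta(o_t, o_{t+1}) as UPDATED / DELETED / ADDED entries."""
    cost = np.array([[1.0 - SequenceMatcher(None, p, q).ratio()
                      for q in next_elems] for p in prev_elems])
    rows, cols = linear_sum_assignment(cost)           # optimal 1-to-1 matching
    changes = []
    for r, c in zip(rows, cols):
        similarity = 1.0 - cost[r, c]
        if similarity >= threshold:
            continue                                   # element unchanged
        changes.append(("UPDATED", prev_elems[r], next_elems[c]))
    matched_prev, matched_next = set(rows), set(cols)
    for i, p in enumerate(prev_elems):                 # in o_t but not o_{t+1}
        if i not in matched_prev:
            changes.append(("DELETED", p))
    for j, q in enumerate(next_elems):                 # in o_{t+1} but not o_t
        if j not in matched_next:
            changes.append(("ADDED", q))
    return changes</code></pre>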
          <h3 class="title is-4">Step III: Learning Environment Dynamics</h3>
          <p>
            Lastly, using \( \tilde{\mathcal{D}} \), we proceed to train the internal world model \( \phi \) of the web agent 
            to learn the environment dynamics. Formally, an LLM working as the world model is trained to predict 
            the abstracted observation \( \tilde{o}_{t+1} \) of the next state \( s_{t+1} \), given three inputs:
            the user instruction \( I \), the current observation \( o_t \), and the current action \( a_t \).
            This LLM is trained to minimize the following loss term via the next-token prediction objective:
          </p>
          <p>
              \[
              \mathcal{L}_{\phi} = -\sum_{(I,\, o_t,\, a_t,\, \tilde{o}_{t+1}) \in \tilde{\mathcal{D}}} \log p_{\phi}(\tilde{o}_{t+1} \mid o_t, a_t, I)
              \]
          </p>
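          <p>
            In a standard causal-LM fine-tuning setup, this loss amounts to masking the prompt tokens and scoring only the tokens of \( \tilde{o}_{t+1} \). A minimal sketch follows; the prompt template and model interface are assumptions, not our exact training recipe.
          </p>
          <pre><code># Minimal sketch of Step III: next-token prediction loss for the world model.
# The prompt template is hypothetical; any HuggingFace-style causal LM and
# tokenizer would work the same way.
import torch

def world_model_loss(model, tokenizer, instruction, obs, action, target_desc):
    prompt = (f"Instruction: {instruction}\nObservation: {obs}\n"
              f"Action: {action}\nNext state change:")
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
    target_ids = tokenizer(target_desc, return_tensors="pt").input_ids
    input_ids = torch.cat([prompt_ids, target_ids], dim=1)
    labels = input_ids.clone()
    labels[:, : prompt_ids.shape[1]] = -100   # score only tokens of the abstraction
    return model(input_ids=input_ids, labels=labels).loss</code></pre>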
        </div>
        <div id="step2-content" class="content step-content" style="display: none;">
          <div class="content has-text-centered">
            <img src="static/images/method_2.png" alt="Inference-Time Policy Optimization with the World Model">
          </div>
          <p>
            During inference at time \( t \) with a current observation \( o_t \), the WMA web agent utilizes the world model \( \phi \) to foresee how an action can affect the state (i.e., predict \( \tilde{o}_{t+1}^i \)), and accordingly finds the optimal action \( a_t \) from the policy model \( \theta \) that leads to the target goal defined in \( I \).
          </p>
          <p>
            We begin by sampling \( k \) action candidates
            \( \{a_t^1, a_t^2, ..., a_t^k\} \) from \( \theta \) via top-\( p \) decoding, enabling diverse exploration of possible future observations
            \( \{o_{t+1}^1, o_{t+1}^2, ..., o_{t+1}^k\} \).
            Next, we use the world model \( \phi \) to "<em>simulate</em>" the potential next observation \( \tilde{o}_{t+1}^i \) caused by each action candidate \( a_t^i \):
          </p>

            <p>
                \[
                \{\tilde{o}_{t+1}^i\}_{i=1}^k = \{\phi(o_t, a_t^i, I)\}_{i=1}^k
                \]
            </p>

            <p>
                Lastly, we decide the agent's action for actual operation by selecting the action leading to the most optimal future state \( s_{t+1} \) from all action candidates. 
                We use an off-the-shelf LLM as a value function \( V(\cdot) \) to estimate the reward yielded by each action candidate 
                and select the action \( \hat{a}_t \) with the highest reward:
            </p>
            <p>
              \[
              \hat{a}_t = \underset{a_t^i \in \{a_t^1, ..., a_t^k\}}{\text{argmax}} \, V(I, o_t, a_t^i, \tilde{o}_{t+1}^i)
              \]
            </p>
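            <p>
              Putting the three calls together, the selection loop can be sketched as below. <code>policy</code>, <code>world_model</code>, and <code>value_fn</code> are hypothetical wrappers around the policy LLM \( \theta \), the world model \( \phi \), and the value function \( V(\cdot) \).
            </p>
            <pre><code># Minimal sketch of Phase II: simulate each candidate action with the world
# model, score the simulated outcomes, and act on the best candidate.
def select_action(policy, world_model, value_fn, instruction, obs, k=3):
    candidates = [policy.sample(instruction, obs, top_p=0.95)  # a_t^1..a_t^k
                  for _ in range(k)]
    simulated = [world_model.predict(obs, a, instruction)      # o~_{t+1}^i
                 for a in candidates]
    rewards = [value_fn.score(instruction, obs, a, o_sim)      # V(I, o_t, a_t^i, o~_{t+1}^i)
               for a, o_sim in zip(candidates, simulated)]
    best = max(range(k), key=lambda i: rewards[i])             # argmax over candidates
    return candidates[best]</code></pre>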
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section" style="padding-top: 0; padding-bottom: 0;">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <hr style="height: 2px; background-color: #dbdbdb; margin: 2rem 0;">
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-2">Experiments Setup</h2>
        <div class="content">
          <h3 class="title is-4">Benchmarks and evaluation metrics</h3>
          <p>For evaluation, we use the official <a href="https://github.com/web-arena-x/webarena">WebArena</a> and
            <a href="https://github.com/OSU-NLP-Group/Mind2Web">Mind2Web</a> benchmarks. WebArena includes 812 real-life tasks
            in simulated environments across five different websites - e-commerce (Shopping), social forums (Reddit),
            collaborative software development (Gitlab), content management (CMS), and Map. The main
            metric, Success Rate (SR), is the percentage of user instructions that are successfully accomplished
            by the generated agent trajectory. Mind2Web, on the other hand, covers over 2,000 open-ended tasks
            collected from 137 websites across 31 domains, along with crowdsourced action sequences for those tasks.
            Besides SR, Mind2Web also reports Step SR, which measures whether the predicted action selects both the
            correct action type (action F1) and the correct element ID (element accuracy); a trajectory counts as a
            success only when the agent succeeds at every step.</p>
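          <p>
            For clarity, the sketch below shows how Step SR and trajectory-level success relate. The per-step record format is a hypothetical; the official benchmark scripts are authoritative.
          </p>
          <pre><code># Minimal sketch of the Mind2Web-style metrics described above.
def step_and_task_success(steps):
    """A step is correct only if both element ID and action type match."""
    correct = [s["pred_elem"] == s["gold_elem"] and s["pred_op"] == s["gold_op"]
               for s in steps]
    step_sr = sum(correct) / len(correct)   # fraction of correct steps
    task_success = all(correct)             # SR: every step must succeed
    return step_sr, task_success</code></pre>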
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section" style="padding-top: 0; padding-bottom: 0;">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <hr style="height: 2px; background-color: #dbdbdb; margin: 2rem 0;">
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-2">Results</h2>
        <div class="content">
          <h3 class="title is-4">Agent Performance in WebArena</h3>
          <div class="has-text-centered">
            <figure class="image">
              <img src="static/images/table_1.png" alt="Table 1">
            </figure>
            <figure class="image">
              <img src="static/images/table_2.png" alt="Table 2" style="width: 90%; margin: 0 auto;">
            </figure>
          </div>
          <br>
          <p>From our experiments in Table 1 and Table 2, we observed the following results:</p>
          <ul>
            <li><strong>WMA vs. Vanilla CoT</strong>
              <ul>
                <li>WMA web agent achieves a 16.6% success rate compared to 13.1% for vanilla CoT.</li>
                <li>Significant improvements are observed across almost all domains in WebArena (see Table 2).</li>
              </ul>
            </li>
            <br>
            <li><strong>Performance Gains with GPT-4o-mini</strong>
              <ul>
                <li>181% performance gain over CoT in the Gitlab domain.</li>
                <li>92% performance gain over CoT in the Map domain.</li>
              </ul>
            </li>
            <br>
            <li><strong>Comparison with Tree Search Agent (Koh et al., 2024)</strong>
              <ul>
                <li>The Tree search agent has a slightly higher absolute success rate (19.2%) compared to the WMA agent (16.6%).</li>
                <li>The WMA agent shows a larger performance improvement over vanilla CoT (+29.7%) than the Tree search agent (+28.0%).</li>
              </ul>
            </li>
          </ul>
        </div>
        <br>
        <div class="content">
          <h3 class="title is-4">Agent Performance in Mind2Web</h3>
          <figure class="image">
            <img src="static/images/table_3.png" alt="Table 3">
          </figure>
          <p>From our experiments in Table 3, we observed the following results:</p>
          <ul>
            <li><strong>Comparison with Previous SOTA Methods</strong>
              <ul>
                <li>WMA web agent is compared with MindAct (Deng et al., 2024) and AWM (Wang et al., 2024b).</li>
                <li>WMA web agent significantly outperforms AWM, achieving new SOTA performance.</li>
              </ul>
            </li>
            <br>
            <li><strong>Generalization Capability of WMA</strong>
              <ul>
                <li>WMA web agent, trained on Mind2Web data, shows strong generalization capabilities.</li>
                <li>This makes our approach much more valuable in scenarios where
                  collecting data for new web environments is non-trivial.</li>
              </ul>
            </li>
          </ul>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section" style="padding-top: 0; padding-bottom: 0;">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <hr style="height: 2px; background-color: #dbdbdb; margin: 2rem 0;">
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-2">Analysis</h2>
        <div class="content">
          <h3 class="title is-4">Time and Cost Efficiency</h3>
          <div class="has-text-centered">
            <figure class="image">
              <img src="static/images/table_4.png" alt="Table 4">
            </figure>
          </div>
          <ul>
            <li><strong>Time Efficiency</strong>
              <ul>
                <li>The Tree search agent takes an average of 748.3 seconds per user instruction due to state exploration and backtracking.</li>
                <li>The WMA web agent completes the same task in only 140.3 seconds by simulating actions instead of executing them.</li>
                <li>WMA is 5.3 times faster than the Tree search agent.</li>
              </ul>
            </li>
            <br>
            <li><strong>API Cost Efficiency</strong>
              <ul>
                <li>The Tree search agent incurs 6.8 times higher API costs due to its multi-modal inputs.</li>
              </ul>
            </li>
          </ul>
        </div>
        <br>
        <div class="content">
          <h3 class="title is-4">Ablation Study</h3>
          <p>
            We conduct several ablation studies on our WMA web agent with 200 randomly sampled instances from WebArena (Shopping: 50; Gitlab: 50; Map: 100). We use GPT-4o-mini as the policy model.
          </p>
          <div class="has-text-centered">
            <figure class="image" style="width: 90%; margin: 0 auto;">
              <img src="static/images/table_5.png" alt="Table 5">
            </figure>
          </div>
          <br>
          <p>
            We observe the following findings in Table 5:
          </p>
          <ul>
            <li>Accessing simulated next states in reward estimation improves agent performance.</li>
            <li>Fine-tuning yields better world models than prompt-based approaches.</li>
            <li>Abstracting observations elicits better next-state prediction.</li>
          </ul>
          <br>
          <div class="has-text-centered">
            <div class="columns is-centered">
              <div class="column is-half has-text-centered">
                <figure class="image" style="width: 80%; margin: 0 auto;">
                  <img src="static/images/table_6.png" alt="Table 6">
                </figure>
              </div>
              <div class="column is-half has-text-centered">
                <figure class="image" style="width: 80%; margin: 0 auto;">
                  <img src="static/images/figure_6.png" alt="Figure 6: Qualitative Analysis">
                </figure>
              </div>
            </div>
          </div>
          <p>
            Additionally, we reveal the following findings in Table 6 and Figure 6:
          </p>
          <ul>
            <li>Fine-tuning the value function is a reasonable alternative in scenarios where API budgets are limited.</li>
            <li>Our WMA web agent may benefit from more exploration of future states when the budget allows.</li>
          </ul>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section" style="padding-top: 0; padding-bottom: 0;">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <hr style="height: 2px; background-color: #dbdbdb; margin: 2rem 0;">
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-2">Case Study</h2>
        <div class="content">
          <p>
            The WMA web agent successfully performs inference on the Gitlab domain of the WebArena benchmark (instance #175). Using GPT-4o as the policy model, the WMA web agent selects the appropriate action <code>click [88]</code> by leveraging its learned environment dynamics.
          </p>
          <div class="has-text-centered">
            <figure class="image" style="width: 100%; margin: 0 auto;">
              <img src="static/images/case.png" alt="Case Study Example">
            </figure>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>


<section class="section" style="padding-top: 0; padding-bottom: 0;">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <hr style="height: 2px; background-color: #dbdbdb; margin: 2rem 0;">
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column">
        <h2 class="title is-2">Citation</h2>
        <div class="content">
          <pre style="white-space: pre-wrap; word-wrap: break-word;"><code>@inproceedings{chae2024web,
  title={Web agents with world models: Learning and leveraging environment dynamics in web navigation},
  author={Chae, Hyungjoo and Kim, Namyoung and Ong, Kai Tzu-iunn and Gwak, Minju and Song, Gwanwoo and Kim, Jihoon and Kim, Sunghwan and Lee, Dongha and Yeo, Jinyoung},
  booktitle={The Thirteenth International Conference on Learning Representations},
  year={2025}
}</code></pre>
        </div>
      </div>
    </div>
  </div>
</section>

</body>
</html>