@Article{druon:ral:2020,
  author    = {Druon, Raphael and Yoshiyasu, Yusuke and Kanezaki, Asako and Watt, Alassane},
  title     = {Visual Object Search by Learning Spatial Context},
  journal   = {IEEE Robotics and Automation Letters},
  year      = {2020},
  volume    = {5},
  number    = {2},
  pages     = {1279--1286},
  month     = {April},
  doi       = {10.1109/LRA.2020.2967677},
  keywords  = {Navigation, Visualization, Feature extraction, Task analysis, Search problems, Semantics, Three-dimensional displays},
  abstract  = {We present a visual navigation approach that uses context information to navigate an agent to find and reach a target object. To learn context from the objects present in the scene, we transform visual information into an intermediate representation called context grid which essentially represents how much the object at the location is semantically similar to the target object. As this representation can encode the target object and other objects together, it allows us to navigate an agent in a human-inspired way: the agent will go to the likely place by seeing surrounding context objects in the beginning when the target is not visible and, once the target object comes into sight, it will reach the target quickly. Since context grid does not directly contain visual or semantic feature values that change according to introductions of new objects, such as new instances of the same object with different appearance or an object from a slightly different class, our navigation model generalizes well to unseen scenes/objects. Experimental results show that our approach outperforms previous approaches in navigating in unseen scenes, especially for broad scenes. We also evaluated human performances in the target-driven navigation task and compared with machine learning based navigation approaches including this work.},
  publisher = {IEEE-INST Electrical Electronics Engineers Inc},
  address   = {445 Hoes Lane, Piscataway, NJ 08855-4141, USA}
}