@article{lappe2025brixelwallcheaperdense,
  author   = "Alexander Lappe and Martin A. Giese",
  title    = "{A}nother {BRIXEL} in the {W}all: {T}owards {C}heaper {D}ense {F}eatures",
  journal  = "arXiv",
  year     = "2025",
  url      = "https://arxiv.org/abs/2511.05168",
  abstract = "Vision foundation models achieve strong performance on both global and locally dense downstream tasks. Pretrained on large images, the recent DINOv3 model family is able to produce very fine-grained dense feature maps, enabling state-of-the-art performance. However, computing these feature maps requires the input image to be available at very high resolution, as well as large amounts of compute due to the squared complexity of the transformer architecture. To address these issues, we propose BRIXEL, a simple knowledge distillation approach that has the student learn to reproduce its own feature maps at higher resolution. Despite its simplicity, BRIXEL outperforms the baseline DINOv3 models by large margins on downstream tasks when the resolution is kept fixed. Moreover, it is able to produce feature maps that are very similar to those of the teacher at a fraction of the computational cost.",
}