\documentclass[pmlr,twocolumn]{jmlr}

\usepackage{longtable}
\usepackage{multirow}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{graphicx}
\jmlryear{2025}
\jmlrproceedings{TerraBytes@ICML}{Proceedings of TerraBytes Workshop at the 42nd International Conference on Machine Learning}


\title[Shaping Fine-Tuning of Geospatial Foundation Models]{Shaping Fine-Tuning of Geospatial Foundation Models: Effects of Label Availability and Temporal Resolution}

  \author{\Name{Giovanni Castiglioni} \Email{giovanni.castiglioni@cenia.cl}\\
  \addr Department of Computer Science, Universidad de Chile, Santiago, Chile \\
  Centro Nacional de Inteligencia Artificial, Macul, Chile
  \AND
  \Name{Nicolás Isla} \\
  \addr Centro de Información de Recursos Naturales, Santiago, Chile \\
  Department of Computer Science, Universidad de Chile, Santiago, Chile
  \AND
  \Name{Cristian {B. Calderon}} \\
  \addr Centro Nacional de Inteligencia Artificial, Macul, Chile
  \AND
  \Name{Javiera Castillo-Navarro} \\
  \addr CEDRIC, Conservatoire National des Arts et Métiers, Paris, France
  \AND
  \Name{Sébastien Lefèvre}\\
  \addr IRISA, Université Bretagne Sud, UMR 6074, Vannes, France \\
  UiT -- The Arctic University of Norway, Tromsø, Norway
  \AND
  \Name{Valentin Barriere}\\
  \addr Department of Computer Science, Universidad de Chile, Santiago, Chile \\
  Centro Nacional de Inteligencia Artificial, Macul, Chile}

\begin{document}

\maketitle

\begin{abstract}
Fine-tuning foundation models is a key step in adapting them to a particular task. In the case of Geospatial Foundation Models (GFMs), fine-tuning can be particularly challenging given data scarcity both in terms of the amount of labeled data and, in the case of Satellite Image Time Series (SITS), temporal context. Under these circumstances, the optimal GFM fine-tuning strategy across different labeled data regimes remains poorly understood. In this paper, we thoroughly assess and study the performance of two different GFMs given several combinations of two data scarcity factors: the number of labeled samples and the sequence length. Specifically, we analyze the performance on a crop classification task, namely semantic segmentation of the Sentinel-2 images contained in the PASTIS-HD dataset. We compare GFMs to U-TAE, as a fully supervised baseline, across varying amounts of labeled data (1\%, 10\%, 50\%, 100\%) and temporal input lengths (1, 6, 15, 25 and 35). Among these explorations, we find that using a smaller learning rate for the pre-trained encoders improves performance in moderate and high data regimes (50\%--100\%). In contrast, full fine-tuning outperforms partial fine-tuning in very low-label settings (1\%--10\%). This behavior suggests a nuanced trade-off between feature reuse and adaptation that defies the intuition of standard transfer learning. The code is available \href{https://github.com/GioCastiglioni/ShapingFT}{here}.
\end{abstract}
\begin{keywords}
Foundation Models, Fine-Tuning, Time-Series, Data Scarcity
\end{keywords}

\section{Introduction}
\label{sec:intro}

Self-supervised learning (SSL) allows the encoding of knowledge into the parameters of so-called foundation models, which through this kind of training leverage large unlabeled datasets to obtain high-quality representations \citep{Devlin2018}. These models typically display powerful transfer learning capabilities, reaching higher performances in tasks where the data can be scarce in terms of labeled examples \citep{Yu2022a,Marszalek2022,Zou2023}. 
% 
Geospatial Foundation Models (GFMs) are instances of this type of model that have been pre-trained on huge datasets of Satellite Images or Satellite Image Time Series (SITS) \citep{DumeurThesis2024}. Processing of remote sensing data, which has traditionally focused on manual interpretation and task-specific models, has recently been revolutionized by the advent of these new large-scale pre-trained methods \citep{Lu2024a}. 
% 
GFMs and other classical pre-trained models showed competitive performances in various tasks, including Crop Type Mapping \citep{Dumeur2024,Chang2024c}. 
They are known to perform especially well in low-data and low-label regimes, which makes them useful, as labeled data collection for geospatial applications can be expensive \citep{Rolf2024}.

% 
However, recent work suggests that GFMs perform worse than Task-Specific Models (TSMs) when labeled data are abundant, even though foundation models typically have two to three orders of magnitude more parameters \citep{Marsocci2024pangea}. These results position machine learning for satellite data as a unique testbed, where typical methods and techniques, which proved beneficial in some modalities, may not necessarily apply to satellite images \citep{Rolf2024}. As such, in this study, we take a deeper dive into this phenomenon, focusing specifically on the task of Crop Type Mapping \citep{Garnot2021}, auditing two GFMs of different complexity, and evaluating whether the performance relation between GFMs and task-specific models is agnostic to the data regime (i.e., small training set and/or short time series). The latter point is motivated by the observation that fine-tuning of large pre-trained models can be unstable, particularly in the case of data scarcity \citep{Mosbach2021,Zhang2021a}.  

% 
In this study, we systematically compare two GFMs and a TSM (acting as baseline). First, we propose to study the effect of the input sequence length, in order to evaluate how much temporal context is needed to reach respectable performance. Second, we propose to study the effect of label scarcity and compare the performance of these models when fine-tuned with parametrically controlled decreasing amounts of data. 

% 
We found that GFMs are competitive with TSMs for Crop Type Mapping even in the case of a large fine-tuning dataset. Nonetheless, the length of the input data sequences affects the performance of GFMs and TSMs in a similar way, i.e., accuracy patterns plateau in a logarithmic manner.

%
Additionally, we find that the generalization capabilities of GFMs crucially depend on the fine-tuning strategy, under particular data regimes. In particular, this strategy consists of weighting the parameter update of the pretrained models, specifically scaling down the learning rate of the encoder. This is done to control the alignment of the latter to the task-specific requirements in terms of feature extraction, manipulating the trade-off between perturbing the already learned features to solve a downstream task, and leveraging those features when the data scarcity conditions or the fine-tuning strategy are not sufficient to accomplish the downstream task.

%
We list the main contributions of this work as follows:

\begin{itemize}
    \item Benchmarking two differently complex geospatial foundation models (CROMA \citep{Fuller2023}, SSL4EO’s DINO \citep{Stewart2023}) against a fully-supervised U-TAE baseline \citep{Garnot2021} in the PASTIS-HD Sentinel-2 dataset under four label budgets and five temporal resolutions, providing a thorough analysis in real-world data constraints. In this context, we found that even with abundant labels, some fine-tuned models can exhibit a consistent advantage over the task-specific model.
    \item We propose to use a specific fine-tuning strategy where a hyperparameter controls the learning rate of the encoder relative to the decoder, isolating the effect of encoder adaptation on downstream performance and allowing us to evaluate how much tuning is beneficial under different data regimes. 
\end{itemize}



\section{Related Works}
\label{sec:related}

\paragraph{Crop Segmentation}
\citet{Garnot2021} were among the first to propose a large scale dataset ($>$124k parcels) of crop segmentation for deep neural networks. They introduced the PASTIS dataset, and the U-TAE, a model based on the aggregation of a U-Net and a Temporal Attention Encoder (TAE) in order to adapt the U-Net architecture to SITS. This dataset was augmented a few years later with images from Sentinel-1 into PASTIS-R \citep{Sainte2021a}, and with images from Very High Resolution \citep{Garioud2023}. 
\citet{Rustowicz2019} also propose to tackle this task, using sparse ground truth labels composing 4 or 5 classes in Sudan and Ghana. 

%
Nowadays, there has been a switch to larger pre-trained SSL models \citep{Yuan2021}. 
\citet{Dumeur2024a,Dumeur2024,DumeurThesis2024} proposed several architectures using SSL methods applied to crop segmentation, such as the Unet-BERT spAtio-temporal Representation eNcoder (U-BARN) to exploit irregularly sampled SITS. They use dense sequences (up to 100 timesteps) to classify the pixels.   

%
Recently, \citet{Reuss2025} proposed to study few-shot time series classification with basic transformers using the EuroCropsML dataset between Portugal, Estonia and Latvia \citep{Reuss2024}. \citet{Barriere2023RSE,Barriere2022cdceo} consider the same crop taxonomy to study few-shot learning between France and the Netherlands. Both highlighted the importance of domain adaptation or pre-training of the model on domain data. 

\paragraph{Foundation Models}

Many works proposing GFMs have been published in the last months (see \citet{Lu2024a} for 
 a review of remote sensing foundation models). They rely on training SSL models on huge datasets \citep{Nedungadi2024,Guo2023a,Bastani2023}. 

Prithvi and Prithvi2.0 \citep{Jakubik2023a,Szwarcman2024} are the largest publicly available models at the time, with 600M parameters. 
% 
\citet{Fuller2023} propose Contrastive Radar-Optical Masked Autoencoders (CROMA), which jointly learn two modalities (Radar and Optical) using both contrastive and masked reconstruction losses. 
% 
Copernicus-FM \citep{Wang2025} is a model fully dedicated to Copernicus data, such as Sentinel-1, Sentinel-2, or Sentinel-5. 
% 
Still with Sentinel data, a series of datasets called SSL4EO-S12 has been released \citep{Wang2023j,Blumenstiel2025} and used to train models based on SSL architectures like DINO \citep{Caron2021}, MAE \citep{He2022a} and MoCo \citep{ChenMoco2021}. Moreover, Vision Transformers (ViTs) have also been used successfully. For instance, \citet{Tarasiou2023} propose a ViT that splits satellite image time series into temporo-then-spatial patches, uses date-aware positional encodings and multiple class tokens, and processes them with factorized attention. Their model outperforms CNN/RNN baselines in crop-type segmentation and classification tasks by wide margins with similar model size and inference time. Similarly, \citet{Bountos2023} propose FoMo-Bench, a unified forest monitoring benchmark, useful to assess the ability of GFMs. Furthermore, these authors propose FoMo-Net, a sensor-agnostic ViT pretrained to fuse optical, SAR, LiDAR and other bands across scales, achieving strong performance on zero-/few-shot classification, segmentation and detection tasks. Yet, no remote sensing foundation model has exhibited universal superiority across all downstream tasks so far \citep{Adorni2025}. 

%
\paragraph{GFMs for Crop Segmentation}

Regarding agriculture applications of GFMs, \citet{Chang2024c} are studying the generalizability of GFM for Crop Type Mapping and proposing the Crop Type Bench. They compare SSL4EO-S12 and SatlasPretrain on a benchmark composed of several datasets for crop segmentation, however they do not consider the time series while temporal information is of paramount importance for distinguishing between crop types.  

%
AnySat \citep{Astruc2024a} and OmniSat \citep{Astruc2024} are two SSL methods focusing on jointly learning multimodal representation by exploiting the alignments of the modalities. The networks remain performant at inference phase with one modality only. AnySat, based on joint embedding predictive architecture (JEPA; \citealt{Assran2023}), obtains state-of-the-art results on the entire PASTIS dataset. 

% 
Galileo (Global and Local Flexible Earth Observation models; \citealt{Tseng2025}) is a GFM also evaluated on the PASTIS dataset, without fully fine-tuning the model but only doing linear probing. \citet{Guo2023a} propose SkySense, a GFM that jointly learns from time-series optical (RGB + multispectral) and synthetic aperture radar (SAR) data. Their model was trained on millions of spatiotemporal sequences, and uses a factorized spatiotemporal encoder, multi-granularity contrastive learning, and geo-context prototypes to create transferable pixel-, object- and image-level features. Evaluated on distinct tasks (e.g., segmentation, detection, change detection, crop mapping), SkySense outperformed several prior remote-sensing foundation models.

%
\citet{Nedungadi2024} propose MMEarth, a multi-modal global dataset used to train a multi-pretext masked autoencoder that reconstructs diverse pixel- and image-level signals to learn representations for Sentinel-2 imagery. Their model outperforms models pretrained on ImageNet and on single-modality satellite-images on several land-cover classification and segmentation benchmarks, especially in low-label settings.

%
Of particular relevance to our work, \citet{Marsocci2024pangea} introduce PANGAEA, a globally diverse benchmark that spans multiple domains, sensor modalities, resolutions and temporalities to standardize GFM evaluation. Their results suggest that current GFMs, although versatile, often fail to consistently outperform simpler supervised baselines, especially when their pre-training data poorly match the downstream task distribution \citep{Rolf2021}, highlighting the need for more robust multi-modal, multi-temporal pre-training.  


\section{Methodology}
\label{sec:method}

\subsection{Models}

In this work, we considered two different GFMs based on their representativeness of the existing SSL families \citep{balestriero2023cookbookselfsupervisedlearning}, model complexity (see Tab.~\ref{tab:params}) and promising previous results shown by \citet{Marsocci2024pangea}, and a state-of-the-art model on the relevant downstream task acting as a fully-supervised specialized network:

\begin{itemize}
    \item CROMA \citep{Fuller2023}: A GFM pre-trained via masked auto-encoding and contrastive learning on 3M patches of multispectral Sentinel-2 data.
    \item SSL4eo-DINO \citep{Blumenstiel2025}: A GFM pre-trained with self-supervised learning on 3M patches of EO imagery using DINO, a self-supervised method based on auto-distilled representations.
    \item U-TAE \citep{Garnot2021}: A supervised baseline with temporal attention, trained from scratch on each data regime.
\end{itemize}

\begin{table}[htbp]
\caption{Model sizes in terms of Trainable Parameters for two configurations: frozen encoder and whole network.}
\centering
\begin{center}
\begin{tabular}{| l | c | c |}
\hline
 \multirow{2}{*}{Model}                & \multicolumn{2}{c|}{\# of Trainable Parameters (M)} \\
 \cline{2-3}
                      & Only Decoder      & Whole network \\ \hline
 CROMA                & 46.95      & 350.0 \\ 
 DINO                 & 30.89      & 53.5 \\ 
 U-TAE                 & -          & 1.1 \\ \hline
 
\end{tabular}
\end{center}
\label{tab:params}
\end{table}


In order to compare the models fairly and adapt them to SITS, we conducted the experiments for both pretrained models using the same decoder architecture, which is a UPerNet \citep{XiaoUperNet2018}, and performed temporal aggregation using a Time-Attention Encoding module, following the same methodology adopted by \citet{Marsocci2024pangea}. 

\paragraph{FT-Rate} We control the adaptation of pre-trained encoders via a hyperparameter defined as the FT-Rate, which scales the encoder's learning rate relative to the untrained decoder’s. This means that FT-Rate $=0.0$ freezes the encoder, FT-Rate $=0.1$ applies a 10x smaller learning rate, and FT-Rate $=1.0$ uses the same learning rate for both the encoder and the decoder.

\subsection{Data Scarcity}

To assess the fine-tuning of GFMs, we conducted experiments on the PASTIS-HD dataset based on multi-temporal Sentinel-2 imagery. The dataset comprises 2,433 image patches of agricultural areas in France with pixel-level annotations.

%
To simulate varying levels of supervision, we sub-sample the training set into four label regimes: 1\%, 10\%, 50\%, and 100\% of the available labeled parcels in the training set. To do this, similar to the stratified methodology adopted by \citet{Marsocci2024pangea}, we generate bins for each patch of the entire set, based on the amount of presence of a class within an image at a pixel-level. This results in a quantized histogram representing a coarse distribution of the classes.

\begin{figure}[!t]
\centerline{\includegraphics[width=1.0\linewidth] {imgs/distributions.pdf}}
\caption{Pixel-level distributions of the training set for each class, in the 4 different adopted label-scarce regimes (100\%, 50\%, 10\%, 1\%).}
\label{fig:distributions}
\end{figure} 

%
To preserve a distribution similar to that of the original training set, we compute the average quantized histogram of the dataset and select the desired percentage of samples with the smallest Jensen-Shannon divergence (JSD). The JSD is a symmetric and smoothed version of the Kullback–Leibler (KL) divergence and is defined as:
\[
\operatorname{JSD}(P \parallel Q) = \frac{1}{2} \operatorname{KL}(P \parallel M) + \frac{1}{2} \operatorname{KL}(Q \parallel M)\text{,}
\]
where $P$ is the average quantized histogram, $Q$ is the quantized histogram of a patch, and \( M = \frac{1}{2}(P + Q) \). The resulting distributions of this process, relative to each subset of the original training set, are shown in Fig.~\ref{fig:distributions}.

This method differs from the one of \citet{Marsocci2024pangea}, which originally led to a high quantity of samples selected, even though the selected percentage of data was small, because at least one sample of each generated bin was included in the subset. With our proposed methodology, it is not necessary to include one sample of each bin, but only the desired percentage of samples that are most similar to the original distribution of the training set.


\subsection{Limited Sequence Length}

Five temporal depths (1, 6, 15, 25, 35 dates) were considered to test the capacity of pretrained models to capture phenological patterns. To simulate limited sequence length, we select 35 time instances, as evenly-spaced as possible, for each patch, and then generate nested subsets from those 35 selected time instances. Particularly, we consider subsets of sizes 1, 6, 15, 25 and 35. The subset with 1 temporal acquisition contains only the last available instance.

%
In order to ensure that the quantity of information contained in the smallest sequences is also contained in the largest ones, we create them in a nested way. Each subset, composed of samples extracted from the 35 originally selected instances, is defined by time instances as evenly-spaced as possible from the immediately larger set, i.e., 25 instances are selected from the original 35, then 15 instances are selected from those 25, and then 6 instances are selected from those 15, ensuring that every subset is contained in the larger ones in a nested manner that preserves temporal context.

\section{Experiments and Results}
\label{sec:exp}

\begin{figure*}[!t]
\floatconts
  {fig:pastisresults}
  {\caption{mIoU per number of instances for models trained on different levels of data scarcity. Each line corresponds to a specific model and encoder fine-tuning rate (FT-Rate), 0.0 corresponding to a fully frozen encoder. U-TAE is shown as a fully-supervised baseline without encoder pretraining.}}% Caption
  {%
    \subfigure[100\% of data][b]{%
      \label{fig:pastis100}
      \includegraphics[width=0.48\linewidth]{imgs/100.pdf}
    }\hfill
    \subfigure[50\% of data][b]{%
      \label{fig:pastis50}
      \includegraphics[width=0.48\linewidth]{imgs/50.pdf}
    }\\[0.5cm]
    \subfigure[10\% of data][b]{%
      \label{fig:pastis10}
      \includegraphics[width=0.48\linewidth]{imgs/10.pdf}
    }\hfill
    \subfigure[1\% of data][b]{%
      \label{fig:pastis1}
      \includegraphics[width=0.48\linewidth]{imgs/1.pdf}
    }
  }
\end{figure*}


\subsection{Label Availability}
\label{sec:label_availability}


Performance is classically evaluated on the basis of the achieved mean Intersection over Union (mIoU). We tested data-scarce scenarios, the first one being the least aggressive, with 50\% of the original training set included, while the validation and test sets remain the same. In this case, Table~\ref{tab:pastis} (right part) and Figure~\ref{fig:pastis50} show patterns similar to those observed in Table~\ref{tab:pastis} (left part) and Figure~\ref{fig:pastis100}, where 100\% of the data were used, with partial fine-tuning marginally and consistently outperforming other strategies for the larger encoder, while U-TAE remained above all the SSL4eo-DINO experiments using multi-temporal data.



\begin{table*}[!t]
\caption{mIoU per instance for CROMA, SSL4eo-DINO, and U-TAE trained with 100\% and 50\% of the Sentinel-2 labeled samples from the PASTIS dataset. FT-Rate denotes the factor applied to scale the encoder's learning rate relative to the decoder. The best result for each multi-temporal configuration is highlighted in \textbf{bold}.}
\centering
\setlength{\tabcolsep}{4pt}
\begin{center}
\setlength{\tabcolsep}{4pt}
\resizebox{.85\textwidth}{!}{
\begin{tabular}{| c | c | c c c c c | c c c c c |}
\hline
 \multirow{2}{*}{Model}             & \multirow{2}{*}{FT-Rate}  & \multicolumn{5}{c|}{100\% of data} & \multicolumn{5}{c|}{50\% of data}   \\
\cline{3-12}                                                                
                                    &            & 1          & 6         & 15        & 25        & 35   & 1          & 6         & 15        & 25        & 35     \\ \hline
                                    
 \multirow{3}{*}{CROMA}             & 0.0        & 16.10      & 43.40     & 53.37     & 55.66     & 56.71 & 14.83      & 39.83     & 47.43     & 51.02     & 52.15   \\
                                                             
                                    & 0.1        & \textbf{19.52}      & \textbf{49.65}     & \textbf{58.11}     & \textbf{60.41}     & \textbf{61.12}    & \textbf{17.44}      & \textbf{45.54}     & \textbf{53.08}     & \textbf{56.95}     & \textbf{57.50}   \\ 

                                    & 1.0        & 17.71      & 49.15     & 56.55     & 60.12     & 61.06   & 15.71      & 45.03     & 52.71     & 56.20     & 57.10    \\ \hline
                                    
 \multirow{3}{*}{DINO}              & 0.0        & 15.55      & 38.08     & 47.41     & 49.59     & 50.68    & 13.52      & 34.55     & 42.52     & 44.30     & 44.92     \\ 
                                                               
                                    & 0.1        & 16.43      & 42.50     & 51.17     & 54.03     & 55.19    & 14.36      & 37.34     & 46.98     & 48.65     & 50.24    \\
                                                             
                                    & 1.0        & 16.91      & 44.61     & 53.65     & 56.14     & 57.23    & 15.34      & 40.28     & 49.16     & 51.85     & 53.02    \\ \hline
                                    
 U-TAE                               & -          & 13.75      & 45.45     & 54.33     & 57.36     & 58.98    & 11.52      & 40.99     & 51.50     & 53.85     & 55.18   \\ \hline
 
\end{tabular}
}
\end{center}
\label{tab:pastis}
\end{table*}


%
The second data-scarce configuration explored was more aggressive, with 10\% of the training data being considered for all models. Table \ref{tab:pastis_scarce} (left part) and Figure \ref{fig:pastis10} show a change in the relation between partial fine-tuning and full fine-tuning, with the latter outperforming the former, and both being the best compared to the rest of the configurations. In this setting, the least complex model, SSL4eo-DINO, manages to reach the performance of U-TAE when fully fine-tuned using 35 instances, which did not happen in previous experiments, starting to exhibit the benefit that even small GFMs have over fully-supervised methods under label-scarcity.


%
Finally, we trained with 1\% of the original training data, making this configuration the most limited in terms of labels available for the model to learn to segment crop fields. Table~\ref{tab:pastis_scarce} (right part) and Figure \ref{fig:pastis1} show the advantage that GFMs have over the fully-supervised model.

\begin{figure*}[!t]
\centerline{\includegraphics[width=1.0\linewidth] {imgs/visualizations_label.pdf}}
\caption{Segmentations produced for three different images on two levels of temporal resolution in all the configurations of label scarcity. \textbf{Left to right} shows the original image, the ground truth label, and two rows of increasing label availability regimes, the \textbf{top row} shows results after training with 1 instance per patch, and the \textbf{bottom row} shows results after training with 35 instances per patch. }
\label{fig:visualizations_label}
\end{figure*}


\begin{figure*}[!t]
\centerline{\includegraphics[width=1.0\linewidth] {imgs/visualizations_temp.pdf}}
\caption{Segmentations produced for three different images on two levels of label availability in all the configurations of temporal resolutions. \textbf{Left to right} shows the original image, the ground truth label, and two rows of increasing temporal resolution regimes, the \textbf{top row} shows results after training with 1\% of the data, and the \textbf{bottom row} shows results after training with 100\% of the data. }
\label{fig:visualizations_temp}
\end{figure*}

%
Using this configuration, when it is fully fine-tuned, SSL4eo-DINO outperforms U-TAE for every sequence length except 6 instances, while the performance of U-TAE stalls as the number of instances increases. The latter is also notably surpassed by some settings of frozen CROMA encoders given sufficiently long time sequences. Moreover, these frozen encoders interestingly surpass the performance of some partially fine-tuned CROMA models.

%
In both the 1\% and 10\% label regimes, CROMA and SSL4eo-DINO significantly outperform U-TAE, even when the encoder is frozen in some cases, underscoring the benefit of large-scale pretraining when supervision is limited.

\begin{table*}[!t]
\caption{mIoU per instance for CROMA, SSL4eo-DINO, and U-TAE trained with 10\% and 1\% of the Sentinel-2 labeled samples from the PASTIS dataset. FT-Rate denotes the factor applied to scale the encoder's learning rate relative to the decoder. The best result for each multi-temporal configuration is highlighted in \textbf{bold}.}
\centering
\setlength{\tabcolsep}{4pt}
\begin{center}
\setlength{\tabcolsep}{4pt}
\resizebox{.85\textwidth}{!}{
\begin{tabular}{| c | c | c c c c c | c c c c c |}
\hline
 \multirow{2}{*}{Model}             & \multirow{2}{*}{FT-Rate}  & \multicolumn{5}{c|}{10\% of data} & \multicolumn{5}{c|}{1\% of data}  \\
\cline{3-12}                                                                
                                    &            & 1          & 6         & 15        & 25        & 35   & 1          & 6         & 15        & 25        & 35  \\ \hline
                                    
 \multirow{3}{*}{CROMA}             & 0.0        & 10.89      & 26.17     & 34.05     & 36.28     & 37.37  & 7.21       & 16.62     & 20.64     & 22.77     & 25.53\\
                                                            
                                    & 0.1        & \textbf{13.09}      & 30.07     & 37.89     & 39.43     & 40.67  & \textbf{7.30}       & 18.63     & 22.81     & 23.26     & 24.06\\ 

                                    & 1.0        & 12.47      & \textbf{32.73}     & \textbf{40.58}     & \textbf{42.29}     & \textbf{45.30}   & 6.44       & \textbf{20.81}     & \textbf{24.46}     & \textbf{25.64}     & \textbf{26.42} \\ \hline
                                    
 \multirow{3}{*}{DINO}              & 0.0        & 10.23      & 23.18     & 28.64     & 31.11     & 31.72  & 6.13       & 13.69     & 16.53     & 17.30     & 15.99 \\ 
                                                              
                                    & 0.1        & 11.00      & 25.06     & 32.83     & 35.79     & 36.09  & 6.52       & 14.56     & 17.69     & 19.12     & 18.71 \\
                                                              
                                    & 1.0        & 11.03      & 27.75     & 35.60     & 37.94     & 39.56   & 6.98       & 17.38     & 20.29     & 21.77     & 22.38\\ \hline
                                    
 U-TAE                               & -          &  9.24      & 28.66     & 37.33     & 38.68     & 39.44    & 5.43       & 18.96     & 19.45     & 19.62     & 21.40 \\ \hline
 
\end{tabular}
}
\end{center}
\label{tab:pastis_scarce}
\end{table*}



\subsection{Temporal Resolution}
\label{sec:temporal_resolution}

Across all models and label regimes, performance improves consistently and plateaus with the increment of the number of temporal observations. The greatest gains occur in low-label settings, highlighting the value of phenological information when supervision is scarce. This can be seen in Figure \ref{fig:pastisresults}, which shows a logarithmic behavior on the progression of mIoU across an increasing number of instances on all the regimes of labeled data.

%
We compare U-TAE along with CROMA and SSL4eo-DINO on the three fine-tuning configurations. For settings with both 100\% and 50\% of training data, U-TAE is the second best model, outperforming every configuration of the SSL4eo-DINO encoder with considerably fewer parameters, except when using mono-instance patches. This exception is expected, given that the main feature of U-TAE lies in its Time-Attention Encoder module.


In the middle example of Figure~\ref{fig:visualizations_temp}, the model trained on the entire dataset succeeds in detecting sorghum (pink) when using at least 15 instances, highlighting the need for temporal context to detect classes at the tail of the distribution.


\subsection{Learning Rate Policy}
\label{sec:learning_rate_policy}

For the first case, Table~\ref{tab:pastis} shows the mIoU per number of instances for the models and the fine-tuning strategies using 100\% of the original training set. FT-Rate indicates the scaling factor applied to the encoder's learning rate relative to the decoder. The results show that using a moderate fine-tuning strategy (i.e., FT-Rate $=0.1$), although marginally, consistently improves the performance of CROMA when the number of instances increases, achieving the highest mIoU for all instance counts. In this sense, partial fine-tuning (FT-Rate $=0.1$) achieves superior performance for CROMA, the larger model, highlighting the benefit of gentle adaptation when ample labeled data are available for large encoders. This pattern is also observed in Table~\ref{tab:pastis} (right part) and Figure~\ref{fig:pastis50} for regimes using 50\% of the labels. These findings suggest a non-monotonic relationship between label availability and optimal FT-Rate. With very few samples, full fine-tuning (FT-Rate $=1.0$) is beneficial. With moderate or ample data, conservative fine-tuning (FT-Rate $=0.1$) preserves useful representations. Freezing encoders (FT-Rate $=0.0$) offers a strong baseline, but usually underperforms partial or full adaptation.
These dynamics challenge the intuition from natural image transfer learning, highlighting the need for domain-specific strategies in remote sensing. Several factors may contribute to this unexpected result, e.g., with few labels a small learning rate may sufficiently adapt neither the encoder nor the randomly initialized decoder, which cannot compensate for features misaligned with the task. Furthermore, even pretrained representations may not align well with segmentation at high spatial resolution, so aggressive adaptation may help to correct this misalignment. These results also suggest that FT-Rate should be treated as a relevant hyperparameter and not fixed a priori.


\subsection{Semantic Map Visualization}
An overall visualization of the effects of varying the temporal resolution and the availability of labels on the segmentations that the models produce is shown in Figure \ref{fig:visualizations_label} and Figure \ref{fig:visualizations_temp}. Scarce regimes have less defined regions and struggle with the correct delineation of crop fields, generating rounded edges and poorly classified regions, even though the label can be correct, as shown in Figure \ref{fig:zoomed}. 

Conversely, richer configurations are better able to capture the actual parcellation and depiction of the landscape. Indeed, when comparing the triple parcels of sorghum, meadow and corn in Figure~\ref{fig:zoomed_triplet}, we can see, on the one hand, that the 100\% model needs the full data sequence to classify them perfectly. On the other hand, having the full data sequence is not enough, as the 35-instance models trained with 50\% of the data still struggle to get the three parcels right.




\section{Conclusion}
\label{sec:ccl}

\begin{figure}[!t]
\floatconts
  {fig:zoomed}
  {\caption{Detailed comparison of ground truth and segmentations produced by data-scarce models. (a) portrays the poor definition of boundaries, even with correct classification. (b) shows the progressive improvement of the segmentations when enriching the data quality.}}
  {%
    \subfigure[Example of poorly delimited boundaries. \textbf{On the left} the original target is shown. \textbf{On the right}, the prediction of the model trained on 1\% of data with 35 instances is shown.][b]{%
      \label{fig:zoomed_boundaries}
      \includegraphics[width=\linewidth]{imgs/zoomed.pdf}
    }\\[0.5cm]
    \subfigure[Progression of models under temporal scarcity (\textbf{on the top}) and label scarcity (\textbf{on the bottom}) converging into the ground truth target.][b]{%
      \label{fig:zoomed_triplet}
      \includegraphics[width=\linewidth]{imgs/zoomed_2.pdf}
    }
  }
\end{figure}

Our findings underscore that fine-tuning is crucial for both GFMs in the studied dense segmentation task. Pre-trained GFMs without adaptation performed poorly, whereas fine-tuned GFMs achieved large gains (e.g., CROMA’s reported mIoU boost). This contribution stresses that model adaptation must be a standard practice, in line with benchmarks that treat fine-tuning as essential.

%
GFMs significantly outperform supervised baselines in low-supervision regimes, and the choice of fine-tuning rate plays a relevant role in achieving the strongest performance on the selected dataset. Surprisingly, full fine-tuning is more effective than partial adaptation when labels are extremely scarce, contradicting common transfer learning assumptions about the quality of the pretrained encoder's features. Our results underscore the need to tailor fine-tuning strategies to data availability and task alignment, and to carefully tune FT-Rate as a hyperparameter in fine-tuning experiments.

%
By contextualizing our results within recent literature \citep{Marsocci2024pangea}, we provide clear recommendations: for large GFMs, tune FT-Rate as a key hyperparameter, as no guarantee of alignment of the pre-training and downstream tasks was observed in our experiments; for experiments with sufficiently abundant labels, fully-supervised models with less computational cost were competitive or superior to larger pretrained models, so consider them as a relevant alternative; leverage long temporal sequences, particularly when supervision is limited, because even frozen GFMs substantially benefited from richer temporal context.


\section*{Acknowledgements}

G.C., N.I., C.B.C. and V.B. have been funded by the grant Tecnologías Avanzadas TA24I10016 ANID, and G.C. has additionally been funded by the grant National Center for Artificial Intelligence CENIA FB210017, Basal ANID.

\section*{Impact Statement}

This paper presents work whose goal is to advance the field of Machine Learning for Earth observation and land cover mapping. Our study contributes to the understanding of GFMs and fine-tuning strategies in label-scarce regimes, which is highly relevant in real-world scenarios where high-quality annotations are expensive or difficult to obtain. Potential societal benefits include improved land use monitoring, agricultural planning, and environmental conservation, particularly in under-resourced regions. We believe there are no foreseeable negative societal consequences or ethical concerns arising from this work.


\bibliography{JRC,COPLAC}
\bibliographystyle{icml2025}


\end{document}
