﻿<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "JATS-journalpublishing1.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">Explor Digit Health Technol</journal-id>
<journal-id journal-id-type="publisher-id">EDHT</journal-id>
<journal-title-group>
<journal-title>Exploration of Digital Health Technologies</journal-title>
</journal-title-group>
<issn pub-type="epub">2996-9409</issn>
<publisher>
<publisher-name>Open Exploration Publishing</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.37349/edht.2026.101194</article-id>
<article-id pub-id-type="manuscript">101194</article-id>
<article-categories>
<subj-group>
<subject>Original Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A comparative study of deep learning-based retinal image registration methods</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0009-0007-9508-0611</contrib-id>
<name>
<surname>Dharmaseelan</surname>
<given-names>Thenuka</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing—original draft</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing—review &amp; editing</role>
<role content-type="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<xref ref-type="aff" rid="I1">
<sup>1</sup>
</xref>
<xref ref-type="fn" rid="afn1">
<sup>†</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0009-0002-8838-8072</contrib-id>
<name>
<surname>Sinha</surname>
<given-names>Neelabh</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role content-type="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing—original draft</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing—review &amp; editing</role>
<xref ref-type="aff" rid="I1">
<sup>1</sup>
</xref>
<xref ref-type="fn" rid="afn1">
<sup>†</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0009-0002-8657-6651</contrib-id>
<name>
<surname>Ashraf</surname>
<given-names>Samyyia</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing—review &amp; editing</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing—original draft</role>
<xref ref-type="aff" rid="I1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0009-0009-0394-9944</contrib-id>
<name>
<surname>Daneshvar</surname>
<given-names>Kimia</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing—original draft</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing—review &amp; editing</role>
<xref ref-type="aff" rid="I2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0009-0009-8235-1763</contrib-id>
<name>
<surname>John</surname>
<given-names>Amit</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<xref ref-type="aff" rid="I1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-1518-9788</contrib-id>
<name>
<surname>Giannakis</surname>
<given-names>Periklis</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing—review &amp; editing</role>
<xref ref-type="aff" rid="I1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="I3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0009-0003-4893-0143</contrib-id>
<name>
<surname>Chan</surname>
<given-names>Yik Ting</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing—review &amp; editing</role>
<xref ref-type="aff" rid="I1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="I4">
<sup>4</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0009-0004-4911-4233</contrib-id>
<name>
<surname>Chan</surname>
<given-names>Yiu Wai</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing—review &amp; editing</role>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<xref ref-type="aff" rid="I1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0000-0003-1782-4711</contrib-id>
<name>
<surname>Pontikos</surname>
<given-names>Nikolas</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role content-type="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing—review &amp; editing</role>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<xref ref-type="aff" rid="I1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="cor1">
<sup>*</sup>
</xref>
</contrib>
<contrib contrib-type="editor">
<name>
<surname>Koulaouzidis</surname>
<given-names>Anastasios</given-names>
</name>
<role>Academic Editor</role>
<aff>University of Southern Denmark (SDU), Denmark</aff>
</contrib>
</contrib-group>
<aff id="I1">
<sup>1</sup>UCL Institute of Ophthalmology, EC1V 9EL London, United Kingdom</aff>
<aff id="I2">
<sup>2</sup>Translational Ophthalmology Research Center, Farabi Eye Hospital, Tehran University of Medical Sciences, Tehran 13366 16351, Iran</aff>
<aff id="I3">
<sup>3</sup>Faculty of Medicine and Dentistry, Queen Mary University of London, E1 2AD London, United Kingdom</aff>
<aff id="I4">
<sup>4</sup>Darent Valley Hospital, Dartford and Gravesham NHS Trust, DA2 8DA Dartford, United Kingdom</aff>
<author-notes>
<fn id="afn1" fn-type="equal">
<label>†</label>
<p>These authors share the first authorship.</p>
</fn>
<corresp id="cor1">
<bold>
<sup>*</sup>Correspondence:</bold> Nikolas Pontikos, UCL Institute of Ophthalmology, 11-43 Bath Street, EC1V 9EL London, United Kingdom. <email>n.pontikos@ucl.ac.uk</email></corresp>
</author-notes>
<pub-date pub-type="collection">
<year>2026</year>
</pub-date>
<pub-date pub-type="epub">
<day>24</day>
<month>05</month>
<year>2026</year>
</pub-date>
<volume>4</volume>
<elocation-id>101194</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>18</day>
<month>03</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>© The Author(s) 2026.</copyright-statement>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This is an Open Access article licensed under a Creative Commons Attribution 4.0 International License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, sharing, adaptation, distribution and reproduction in any medium or format, for any purpose, even commercially, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Aim:</title>
<p id="absp-1">To benchmark three deep learning-based retinal image registration methods—RetinaRegNet, EyeLiner, and GeoFormer—on the Fundus Image Registration (FIRE) dataset and to compare their registration accuracy and computational efficiency, using mean landmark error (MLE) as the primary outcome measure.</p>
</sec>
<sec>
<title>Methods:</title>
<p id="absp-2">The three image registration approaches were evaluated using the FIRE dataset under consistent conditions across varying image overlap conditions (Classes S, A, and P). These included: (a) RetinaRegNet, which incorporates diffusion features, dual keypoint sampling through Scale-Invariant Feature Transform (SIFT) and random sampling, two-stage outlier removal, and a multilevel registration hierarchy progressing from homography to polynomial transforms; (b) EyeLiner, which integrates anatomical segmentation with SuperPoint feature extraction, LightGlue matching, and thin-plate spline warping; and (c) GeoFormer, which builds on the Local Feature Transformer (LoFTR) through cross-attention mechanisms and Random Sample Consensus (RANSAC)-based refinement. Registration performance was quantified using MLE.</p>
</sec>
<sec>
<title>Results:</title>
<p id="absp-3">Across all 134 FIRE image pairs, RetinaRegNet achieved the lowest overall MLE (3.12 pixels), outperforming EyeLiner (3.81 pixels) and GeoFormer (6.06 pixels). Class-specific analysis showed that RetinaRegNet delivered the highest accuracy in Class S images (1.70 pixels), competitive performance in Class A (5.24 pixels), and the strongest results in the most challenging Class P cases (4.57 pixels). GeoFormer demonstrated the shortest processing time at 0.32 seconds per image pair, compared with 4.92 seconds for EyeLiner and 31.23 seconds for RetinaRegNet. In Class P, RetinaRegNet achieved a 59.2% improvement in accuracy relative to GeoFormer (4.57 vs 11.20 pixels). The code is available at: <uri xlink:href="https://github.com/ThenukaDharmaseelan/image_Registration">https://github.com/ThenukaDharmaseelan/image_Registration</uri>.</p>
</sec>
<sec>
<title>Conclusions:</title>
<p id="absp-4">Overall, the evaluation reveals a clear trade-off between registration precision and computational speed. RetinaRegNet achieves the lowest MLE for complex clinical cases despite higher computational cost. EyeLiner balances precision and speed for routine use, while GeoFormer prioritizes rapid throughput where processing speed is critical.</p>
</sec>
</abstract>
<kwd-group>
<kwd>retinal image registration</kwd>
<kwd>fundus photography</kwd>
<kwd>deep learning</kwd>
<kwd>EyeLiner</kwd>
<kwd>GeoFormer</kwd>
<kwd>RetinaRegNet</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p id="p-1">Retinal image registration is the process of aligning images across timepoints or across imaging modalities to make them comparable. This area of computer vision represents a cornerstone of modern ophthalmic research and clinical care, facilitating objective assessment of disease progression and evaluation of treatment efficacy [<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>]. Retinal imaging modalities include colour fundus photography, fundus autofluorescence, infrared reflectance, optical coherence tomography (OCT), fluorescein angiography, and scanning laser ophthalmoscopy, which capture complementary anatomical and functional information, essential for comprehensive disease assessment. These medical images are widely used in longitudinal studies to monitor disease progression and are therefore ideally suited for image alignment and comparative analysis. However, effective utilisation of retinal imaging data depends on robust registration frameworks capable of accurately aligning images despite substantial variations in resolution, contrast, illumination, and spectral characteristics across imaging modalities.</p>
<p id="p-2">The advent of deep learning precipitated a paradigm shift, initially utilising convolutional neural networks (CNNs) to predict alignment directly or via coarse-to-fine strategies [<xref ref-type="bibr" rid="B3">3</xref>–<xref ref-type="bibr" rid="B5">5</xref>]. As the field matured, research pivoted toward addressing specific limitations of early CNNs. Style-transfer frameworks were introduced to unify multimodal representations, enabling robust vessel segmentation without pixel-wise annotations [<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B7">7</xref>]. To eliminate the dependency on ground-truth transformations, unsupervised deformable networks employing Spatial Transformer architectures were developed [<xref ref-type="bibr" rid="B8">8</xref>], while multi-scale frameworks incorporating edge similarity losses addressed optical distortions in fundus-OCT alignment [<xref ref-type="bibr" rid="B9">9</xref>]. Despite these sophisticated advances, the demand for greater robustness and interpretability has led to the divergence of three specialized architectural families (<xref ref-type="sec" rid="s-suppl">Figure S1</xref>). First, anatomically guided models began leveraging structural priors; early work explored structure-driven regression [<xref ref-type="bibr" rid="B10">10</xref>], while keypoint-based methods focused on explicit landmark matching [<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B12">12</xref>]. Second, transformer-based architectures and detector-free frameworks were adopted to capture long-range spatial dependencies via attention mechanisms [<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>]. Finally, generative diffusion models, built on foundational probabilistic synthesis research, emerged to synthesize robust, illumination-invariant feature representations [<xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B16">16</xref>].</p>
<p id="p-3">However, a clear consensus on the optimal balance between computational efficiency and registration accuracy across these families remains elusive. To address this gap, we conduct a focused comparative analysis of three state-of-the-art frameworks on the FIRE dataset: RetinaRegNet, a zero-shot diffusion-based model leveraging hierarchical outlier suppression [<xref ref-type="bibr" rid="B17">17</xref>]; EyeLiner, an anatomically guided pipeline combining vessel landmark extraction with transformer matching [<xref ref-type="bibr" rid="B18">18</xref>]; and GeoFormer, a geometry-aware transformer incorporating Local Feature Transformer (LoFTR)-style correspondence detection with Random Sample Consensus (RANSAC) filtering for spatial consistency [<xref ref-type="bibr" rid="B19">19</xref>]. By evaluating these methods under varying degrees of overlap and anatomical distortion using an established classification system, this study provides a systematic comparative assessment of their accuracy, efficiency, and clinical suitability for large-scale retinal image analysis that has not been previously performed. A comprehensive comparison of the architectural approaches used by these three methods across key registration stages is provided in <xref ref-type="sec" rid="s-suppl">Table S1</xref>.</p>
</sec>
<sec id="s2">
<title>Materials and methods</title>
<sec id="t2-1">
<title>Dataset</title>
<p id="p-4">The FIRE dataset is widely recognized as a benchmark for evaluating retinal image registration algorithms. It comprises 134 colour fundus image pairs derived from 129 individual images, each captured at a resolution of 2,912 × 2,912 pixels with a 45° field of view using a NIDEK AFC-210 fundus camera [<xref ref-type="bibr" rid="B20">20</xref>]. To enable systematic performance assessment, the dataset is divided into three difficulty categories based on field-of-view overlap and anatomical variation: Class S (71 pairs) represents easy cases with substantial image overlap (&gt; 75%) without anatomical changes between images; Class A (14 pairs) consists of cases with &gt; 75% overlap that additionally exhibit anatomical changes, as shown in <xref ref-type="fig" rid="fig1">Figure 1</xref>; and Class P (49 pairs) comprises the most challenging cases with limited overlap (&lt; 75%) without anatomical differences between images. Each image pair includes 10 manually annotated ground-truth landmarks, typically located at vessel bifurcations and crossings, which provide a reference standard for quantitative evaluation of registration accuracy [<xref ref-type="bibr" rid="B20">20</xref>]. A visual overview of representative image pairs from each FIRE class is shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>.</p>
<fig id="fig1" position="float">
<label>Figure 1</label>
<caption>
<p id="fig1-p-1">
<bold>Class A anatomical changes on the FIRE dataset. A</bold>: Reference retinal image showing normal fundus appearance; <bold>B</bold>: Test image with pathological features highlighted. Red circles indicate hard exudates; green circles denote cotton wool spots. Prominent optic disc swelling is observed in the test image.</p>
</caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="edht-04-101194-g001.tif" />
</fig>
<fig id="fig2" position="float">
<label>Figure 2</label>
<caption>
<p id="fig2-p-1">
<bold>Pre-registration visual summary of the FIRE dataset.</bold> Columns show the fixed (reference) image, the moving (test) image, and their unregistered overlay. Ground-truth landmarks are shown as yellow (fixed) and blue (moving) points.</p>
</caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="edht-04-101194-g002.tif" />
</fig>
</sec>
<sec id="t2-2">
<title>RetinaRegNet</title>
<p id="p-5">RetinaRegNet is a zero-shot framework designed to register pairs of retinal images by identifying correspondences within semantic diffusion features and subsequently warping one image onto the other. The process follows a broadly three-stage pipeline encompassing feature extraction, correspondence refinement, and hierarchical transformation (<xref ref-type="fig" rid="fig3">Figure 3</xref>).</p>
<fig id="fig3" position="float">
<label>Figure 3</label>
<caption>
<p id="fig3-p-1">
<bold>Overview of the RetinaRegNet registration pipeline.</bold> Moving and fixed fundus images are processed using diffusion-based feature extraction. Keypoints are selected using SIFT and random sampling, correspondences are established via cosine similarity, and outliers are removed through two-stage filtering. Two-stage hierarchical registration applies homography for global alignment and third-order polynomial transformation for local alignment, producing the final registered output.</p>
</caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="edht-04-101194-g003.tif" />
</fig>
<p id="p-6">First, rich feature maps are extracted from both the fixed and moving images using a pretrained Stable Diffusion model. Each image is passed through the model at a low noise step, producing intermediate feature tensors that capture both vessel patterns and broader retinal anatomy while remaining robust to variations in illumination and contrast.</p>
<p id="p-7">With these features in hand, the method selects a balanced set of control points, approximately half from SIFT (to capture textured vessel regions) and half sampled uniformly at random (to cover smooth areas), ensuring that correspondence estimation is not biased toward densely textured regions. It then computes correspondences by cosine similarity in feature space so that a point <italic>p</italic> in the fixed image is paired with the most similar location <italic>q</italic> in the moving image. This approach provides invariance to photometric variations and tolerates moderate geometric drift.</p>
<p id="p-8">Matched pairs are then refined through a two-step filtering process that eliminates outliers. First, a forward–backward (inverse consistency) check ensures that correspondences agree in both directions; mismatches that fail this test are discarded. Second, a geometric filtering step removes any pairs that deviate significantly from a globally consistent transformation. This combination of semantic matching and hierarchical outlier rejection produces a dense but reliable correspondence field that can support both global and local alignment.</p>
<p id="p-9">Finally, RetinaRegNet performs a coarse-to-fine warp in two stages. Stage 1 estimates a global homography to absorb overall eye or camera motion, followed by Stage 2, a smooth local polynomial warp to correct small residual deformations. In practice, this combination of semantic feature matching, rigorous yet efficient filtering, and sequential global-to-local warping achieves high registration accuracy, particularly in low-overlap scenarios. The diffusion features enable recognition of vascular and anatomical patterns even where traditional pixel-based methods fail, while the two-stage transformation mitigates overfitting and preserves anatomical coherence.</p>
</sec>
<sec id="t2-3">
<title>EyeLiner</title>
<p id="p-10">EyeLiner replicates clinical assessment patterns through modular anatomical structure analysis. The pipeline can be understood as a four-stage process that algorithmically mimics how clinicians track pathological changes relative to stable anatomical landmarks. An overview of the EyeLiner pipeline is shown in <xref ref-type="fig" rid="fig4">Figure 4</xref>. The image pairs are first segmented to extract the retinal vasculature and optic disc. In the original EyeLiner pipeline, a standard U-Net architecture trained on established vessel segmentation datasets was used to generate vascular masks for both the fixed and moving images. For reproducibility, we instead relied on AutoMorph [<xref ref-type="bibr" rid="B21">21</xref>], a publicly available deep learning pipeline for vessel segmentation. Optic disc segmentation is performed using MaskFormer [<xref ref-type="bibr" rid="B22">22</xref>], a transformer-based architecture for general-purpose image segmentation. The resulting disc mask can be used to filter out keypoints detected (in the following step) within the optic disc region, as vessels inside the disc are subject to biological motion and thus are less reliable for geometric alignment.</p>
<fig id="fig4" position="float">
<label>Figure 4</label>
<caption>
<p id="fig4-p-1">
<bold>Overview of the EyeLiner registration pipeline.</bold> Fixed and moving fundus images are first compared using a checkerboard overlay. Vessel and optic disc regions are segmented to guide correspondence estimation. Keypoints are detected with SuperPoint and matched using LightGlue along vascular structures. A Thin-Plate Spline model then warps the moving image, producing the registered output and improved checkerboard alignment.</p>
</caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="edht-04-101194-g004.tif" />
</fig>
<p id="p-11">Once the vascular regions are defined, EyeLiner detects and matches distinctive vessel features in a unified process. The SuperPoint [<xref ref-type="bibr" rid="B23">23</xref>] network is used to identify salient keypoints along the segmented vessels and to compute descriptor vectors that capture the local appearance of each point. These descriptors are then passed directly to LightGlue [<xref ref-type="bibr" rid="B24">24</xref>], a lightweight transformer that performs feature matching. Rather than relying on direct numerical comparison of descriptor values, LightGlue interprets the geometric layout and contextual relationships among vessels in both images, producing more reliable correspondences even when illumination, scale, or focus differ between acquisitions. This combination of SuperPoint and LightGlue therefore establishes anatomically grounded, context-aware correspondences that are robust to the variations that typically challenge conventional intensity-based registration methods.</p>
<p id="p-12">The final stage of the EyeLiner pipeline performs the actual alignment by warping the moving image to match the fixed reference. Once correspondences are confirmed, a thin-plate spline transformation is used to model the deformation between the two images. This smooth, flexible transformation captures both global displacement and local non-rigid motion in the retinal vasculature while maintaining overall anatomical plausibility. In practice, the combination of anatomically guided segmentation, context-aware matching, and smooth deformation modelling enables EyeLiner to produce clinically interpretable and geometrically stable registrations, offering a strong balance between accuracy, computational efficiency, and biological realism.</p>
</sec>
<sec id="t2-4">
<title>GeoFormer</title>
<p id="p-13">GeoFormer builds upon the existing LoFTR framework and employs a geometry-aware transformer to learn dense correspondences between retinal images without relying on explicit keypoint detectors. The method can be broadly divided into three stages. A schematic overview of the GeoFormer framework is presented in <xref ref-type="fig" rid="fig5">Figure 5</xref>. GeoFormer begins with a ResNet-FPN backbone [<xref ref-type="bibr" rid="B25">25</xref>] that extracts multiscale feature maps from the fixed and moving retinal images. The model captures coarse-level features at one-eighth resolution, encoding global anatomical information such as optic disc shape and overall curvature, and fine-level features at one-half resolution, which preserve local vessel structures, edge patterns, and texture details. This multi-level representation ensures that both global context and fine anatomical information are passed to the transformer network.</p>
<fig id="fig5" position="float">
<label>Figure 5</label>
<caption>
<p id="fig5-p-1">
<bold>GeoFormer architecture and methodology.</bold> Stage 1: ResNet-FPN extracts multi-scale features at 1/2 and 1/8 resolution. Stage 2: Transformer blocks perform self-attention and cross-attention to generate confidence maps, followed by geometry-aware RANSAC and refinement for sub-pixel accuracy. Stage 3: The final homography matrix is computed and applied to produce the registered output.</p>
</caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="edht-04-101194-g005.tif" />
</fig>
<p id="p-14">The extracted features are processed through transformer blocks that perform self-attention within each image to capture contextual dependencies and cross-attention between the image pair to infer correspondences. This produces a confidence map that indicates how likely each feature in one image matches a feature in the other. The resulting matches are filtered using a geometry-aware RANSAC procedure, which removes correspondences inconsistent with a plausible geometric transformation and estimates an initial homography matrix. These verified matches are then refined through a second transformer stage that focuses attention only on local regions surrounding the geometrically consistent points, improving computational efficiency and accuracy in vessel-dense areas. Finally, the model revisits the fine-scale feature maps to achieve sub-pixel correspondence accuracy, accommodating vessel curvature and illumination variations that coarse features cannot fully capture.</p>
<p id="p-15">The refined correspondences are used to compute a final homography matrix that maps coordinates from the moving image onto the fixed image. The moving image is then warped according to this transformation, completing the registration process.</p>
</sec>
<sec id="t2-5">
<title>Evaluation metrics</title>
<p id="p-16">Registration accuracy was quantified using the MLE, reported in pixels, for each evaluated method. MLE measures registration accuracy by calculating the average distance between the corresponding anatomical landmarks and their estimated locations obtained from the registration method, with lower values indicating better alignment:</p>
<p id="p-17">
<disp-formula id="eq1">
<label></label>
<mml:math id="meeda3">
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo stretchy="false">∑</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi> </mml:mi>
<mml:mo>=</mml:mo>
<mml:mi> </mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:msqrt>
<mml:msup>
<mml:mrow>
<mml:mfenced separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>'</mml:mi>
<mml:mi>'</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>-</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>'</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>'</mml:mi>
<mml:mi>'</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>-</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>'</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:msqrt>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p id="p-18">where (<italic>x</italic><sub><italic>i</italic></sub><sup>′</sup>, <italic>y</italic><sub><italic>i</italic></sub><sup>′</sup>) represent the manually annotated ground-truth landmarks and (<italic>x</italic><sub><italic>i</italic></sub><sup>′′</sup>, <italic>y</italic><sub><italic>i</italic></sub><sup>′′</sup>) are the corresponding transformed coordinates.</p>
<p id="p-19">In addition to the mean landmark error, we evaluate registration performance using the success rate. A registration is considered successful if a transformed landmark lies within a predefined pixel threshold of its corresponding ground-truth location. For each image pair, this criterion is applied independently to the ten annotated landmarks, yielding a binary outcome (success or failure) per point. The success rate is then computed as the proportion of successful landmarks out of the ten ground-truth points for that image pair. The threshold τ is set to 12.5 pixels, following [<xref ref-type="bibr" rid="B17">17</xref>], to ensure comparability with prior results.</p>
<p id="p-20">
<disp-formula id="eq2">
<label></label>
<mml:math id="mae32c">
<mml:mi>S</mml:mi>
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo stretchy="false">∑</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">Ι</mml:mi>
<mml:mfenced separators="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>≤</mml:mo>
<mml:mi>τ</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:math>
</disp-formula>
</p>
<p id="p-21">where <italic>I</italic>(⋅) is the indicator function, <inline-formula><mml:math id="m08d76"><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msqrt><mml:msup><mml:mrow><mml:mfenced separators="|"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>'</mml:mi></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>'</mml:mi><mml:mi>'</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:msup><mml:mrow><mml:mfenced separators="|"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>'</mml:mi></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>'</mml:mi><mml:mi>'</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:msqrt></mml:math></inline-formula> and <italic>τ</italic> = 12.5 pixels.</p>
<p id="p-22">Finally, we quantify overall registration robustness using the Area Under the Curve (AUC) of the cumulative success-rate curve. For a range of pixel thresholds, the success rate is computed as the proportion of landmarks whose registration error falls below each threshold. The resulting curve characterises how the success rate varies as the error tolerance is relaxed. The AUC provides a single scalar summary of this behaviour, with higher values indicating consistently higher success rates across a wide range of error thresholds:</p>
<p id="p-23">
<disp-formula id="eq3">
<label></label>
<mml:math id="m37f10">
<mml:mi>A</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo stretchy="false">∫</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>R</mml:mi>
<mml:mfenced separators="|">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:math>
</disp-formula>
</p>
<p id="p-24">where <italic>T<sub>max</sub></italic> is the maximum error threshold, set to 25 pixels for the FIRE dataset.</p>
<p id="p-25">We also evaluated registration results using Normalized Cross Correlation (NCC), an intensity-based metric that quantifies linear correlation between image intensities. In retinal imaging, NCC reflects global photometric consistency after warping but is influenced by acquisition-related factors and appearance changes due to disease progression. As NCC is computed over the full image domain, the metric does not explicitly encode pointwise anatomical correspondence. NCC is therefore reported as a complementary measure of global intensity agreement rather than a primary indicator of anatomical registration accuracy.</p>
<p id="p-26">
<disp-formula id="eq4">
<label></label>
<mml:math id="m8c7fc">
<mml:mi>N</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mo stretchy="false">∑</mml:mo>
<mml:mrow>
<mml:mi> </mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi> </mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mfenced separators="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>μ</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mfenced separators="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>μ</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msubsup>
<mml:mo stretchy="false">∑</mml:mo>
<mml:mrow>
<mml:mi> </mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi> </mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced separators="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>μ</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mrow>
<mml:mi> </mml:mi>
<mml:mrow>
<mml:msubsup>
<mml:mo stretchy="false">∑</mml:mo>
<mml:mrow>
<mml:mi> </mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi> </mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced separators="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>μ</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:math>
</disp-formula>
</p>
<p id="p-27">where ∑ denotes the two-dimensional summation over all pixel locations (<italic>x</italic>, <italic>y</italic>), <italic>I<sub>f</sub></italic>(<italic>x</italic>, <italic>y</italic>) is the intensity of the fixed image at location (<italic>x</italic>, <italic>y</italic>), <italic>I<sub>m</sub></italic>(<italic>x</italic>, <italic>y</italic>) is the intensity of the moving image at the same location, and <italic>μ<sub>f</sub></italic> and <italic>μ<sub>m</sub></italic> are the mean intensities of the fixed and moving images, respectively.</p>
</sec>
<sec id="t2-6">
<title>Experimental setup</title>
<p id="p-28">All three registration pipelines were reproduced and evaluated in the same environment to ensure consistency and comparability across methods. All experiments were executed on a workstation equipped with an NVIDIA GeForce RTX 3090 GPU (24 GB VRAM) running CUDA 12.6. The implementations used Python 3.11.13, PyTorch 2.2.2, and torchvision 0.17.2 as the primary deep learning frameworks. For methods requiring vessel segmentation (EyeLiner), AutoMorph was used for reproducibility. All runtime and accuracy measurements reported in the <xref ref-type="sec" rid="s3">Results</xref> section reflect executions under this standardised environment. The code for data processing is available at: <uri xlink:href="https://github.com/ThenukaDharmaseelan/image_Registration">https://github.com/ThenukaDharmaseelan/image_Registration</uri>.</p>
<p id="p-29">The exact parameter configurations used for each registration pipeline are provided in <xref ref-type="sec" rid="s-suppl">Tables S2</xref>, <xref ref-type="sec" rid="s-suppl">S3</xref>, and <xref ref-type="sec" rid="s-suppl">S4</xref>.</p>
</sec>
</sec>
<sec id="s3">
<title>Results</title>
<sec id="t3-1">
<title>Overall performance analysis</title>
<p id="p-30">As shown in <xref ref-type="table" rid="t1">Table 1</xref> and <xref ref-type="fig" rid="fig6">Figure 6</xref>, RetinaRegNet achieved the lowest overall MLE of 3.12 ± 2.43 pixels across the FIRE dataset. A Friedman test confirmed significant differences between the three methods (χ²(2) = 134.37, <italic>P</italic> = 2 × 10<sup>–16</sup>). Post-hoc Wilcoxon signed-rank tests with Bonferroni correction demonstrated that RetinaRegNet achieved significantly lower MLE than both EyeLiner (3.81 ± 3.13 pixels, <italic>P</italic> = 1 × 10<sup>–4</sup>) and GeoFormer (6.06 ± 4.86 pixels, <italic>P</italic> = 2 × 10<sup>–21</sup>), while EyeLiner also significantly outperformed GeoFormer (<xref ref-type="table" rid="t2">Table 2</xref>).</p>
<table-wrap id="t1">
<label>Table 1</label>
<caption>
<p id="t1-p-1">
<bold>Performance Comparison on FIRE Dataset using Mean Landmark Error (MLE), Success Rate (SR), Area Under Curve (AUC), and Normalized Cross Correlation (NCC).</bold>
</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>
<bold>Method</bold>
</th>
<th>
<bold>Overall</bold>
<break />
<bold>(<italic>n</italic> = 134)</bold>
</th>
<th>
<bold>Class S</bold>
<break />
<bold>(<italic>n</italic> = 71)</bold>
</th>
<th>
<bold>Class A</bold>
<break />
<bold>(<italic>n</italic> = 14)</bold>
</th>
<th>
<bold>Class P</bold>
<break />
<bold>(<italic>n</italic> = 49)</bold>
</th>
</tr>
</thead>
<tbody>
<tr>
<td />
<td>
<bold>MLE</bold>
</td>
<td>
<bold>MLE</bold>
</td>
<td>
<bold>MLE</bold>
</td>
<td>
<bold>MLE</bold>
</td>
</tr>
<tr>
<td>RetinaRegNet</td>
<td>3.12 ± 2.43</td>
<td>1.70 ± 0.54</td>
<td>5.24 ± 2.64</td>
<td>4.57 ± 2.80</td>
</tr>
<tr>
<td>EyeLiner</td>
<td>3.81 ± 3.13</td>
<td>1.80 ± 0.40</td>
<td>4.87 ± 3.05</td>
<td>6.01 ± 3.75</td>
</tr>
<tr>
<td>GeoFormer</td>
<td>6.06 ± 4.86</td>
<td>2.42 ± 0.77</td>
<td>6.55 ± 4.71</td>
<td>11.20 ± 3.25</td>
</tr>
<tr>
<td />
<td>
<bold>SR</bold>
</td>
<td>
<bold>SR</bold>
</td>
<td>
<bold>SR</bold>
</td>
<td>
<bold>SR</bold>
</td>
</tr>
<tr>
<td>RetinaRegNet</td>
<td>97.76%</td>
<td>100%</td>
<td>92.86%</td>
<td>95.92%</td>
</tr>
<tr>
<td>EyeLiner</td>
<td>97.01%</td>
<td>100%</td>
<td>92.86%</td>
<td>93.88%</td>
</tr>
<tr>
<td>GeoFormer</td>
<td>88.06%</td>
<td>100%</td>
<td>92.86%</td>
<td>69.39%</td>
</tr>
<tr>
<td />
<td>
<bold>AUC</bold>
</td>
<td>
<bold>AUC</bold>
</td>
<td>
<bold>AUC</bold>
</td>
<td>
<bold>AUC</bold>
</td>
</tr>
<tr>
<td>RetinaRegNet</td>
<td>0.89</td>
<td>0.95</td>
<td>0.79</td>
<td>0.85</td>
</tr>
<tr>
<td>EyeLiner</td>
<td>0.85</td>
<td>0.93</td>
<td>0.80</td>
<td>0.76</td>
</tr>
<tr>
<td>GeoFormer</td>
<td>0.76</td>
<td>0.91</td>
<td>0.74</td>
<td>0.54</td>
</tr>
<tr>
<td />
<td>
<bold>NCC</bold>
</td>
<td>
<bold>NCC</bold>
</td>
<td>
<bold>NCC</bold>
</td>
<td>
<bold>NCC</bold>
</td>
</tr>
<tr>
<td>RetinaRegNet</td>
<td>0.56</td>
<td>0.75</td>
<td>0.60</td>
<td>0.29</td>
</tr>
<tr>
<td>EyeLiner</td>
<td>0.56</td>
<td>0.74</td>
<td>0.63</td>
<td>0.28</td>
</tr>
<tr>
<td>GeoFormer</td>
<td>0.64</td>
<td>0.72</td>
<td>0.62</td>
<td>0.53</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="fig6" position="float">
<label>Figure 6</label>
<caption>
<p id="fig6-p-1">
<bold>Overall comparison of registration methods across all classes on the FIRE dataset (<italic>n</italic> = 134).</bold> Box plots show the distribution of MLE in pixels for the three methods.</p>
</caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="edht-04-101194-g006.tif" />
</fig>
<table-wrap id="t2">
<label>Table 2</label>
<caption>
<p id="t2-p-1">
<bold>Overall statistical comparison of MLE across all methods (<italic>n</italic> = 134).</bold>
</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>
<bold>Comparison</bold>
</th>
<th>
<bold>Test</bold>
</th>
<th>
<bold>Statistic</bold>
</th>
<th>
<bold>Raw</bold>
<break />
<bold>
<italic>P</italic>-value</bold>
</th>
<th>
<bold>Bonferroni <italic>P</italic>-value</bold>
</th>
<th>
<bold>Mean difference</bold>
<break />
<bold>(A–B)</bold>
</th>
</tr>
</thead>
<tbody>
<tr>
<td>Friedman</td>
<td>χ²(2)</td>
<td>134.37</td>
<td>2 × 10<sup>–16</sup></td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>RetinaRegNet vs EyeLiner</td>
<td>Wilcoxon V</td>
<td>2653</td>
<td>3 × 10<sup>–5</sup></td>
<td>1 × 10<sup>–4</sup> ***</td>
<td>−0.626</td>
</tr>
<tr>
<td>RetinaRegNet vs GeoFormer</td>
<td>Wilcoxon V</td>
<td>198</td>
<td>8 × 10<sup>–22</sup></td>
<td>2 × 10<sup>–21</sup> ***</td>
<td>−3.050</td>
</tr>
<tr>
<td>EyeLiner vs GeoFormer</td>
<td>Wilcoxon V</td>
<td>544</td>
<td>1 × 10<sup>–18</sup></td>
<td>3 × 10<sup>–18</sup> ***</td>
<td>−2.424</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p id="t2-fn-1">Asterisks denote Bonferroni-adjusted significance: * <italic>P</italic> &lt; 0.05, ** <italic>P</italic> &lt; 0.01, *** <italic>P</italic> &lt; 0.001.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p id="p-31">Box plot analysis in <xref ref-type="fig" rid="fig6">Figure 6</xref> reveals distinct distributional characteristics. RetinaRegNet demonstrates the most compact error distribution with a median of approximately 2.0 pixels and relatively few outliers. EyeLiner shows a slightly wider distribution with a median around 2.5 pixels and a comparable outlier pattern. In contrast, GeoFormer exhibits substantially higher variability, with a median of approximately 4 pixels and a significantly larger interquartile range.</p>
<p id="p-32">Success rate analysis (<xref ref-type="table" rid="t1">Table 1</xref>) showed RetinaRegNet achieved 97.76%, EyeLiner 97.01%, and GeoFormer 88.06% overall, demonstrating RetinaRegNet’s superior reliability in completing registration tasks successfully.</p>
<p id="p-33">AUC analysis (<xref ref-type="table" rid="t1">Table 1</xref>) confirmed RetinaRegNet’s superior performance with an overall AUC of 0.89, compared to EyeLiner (0.85) and GeoFormer (0.76), representing a 5% improvement over EyeLiner and 17% over GeoFormer.</p>
<p id="p-34">NCC analysis (<xref ref-type="table" rid="t1">Table 1</xref>) revealed contrasting patterns. GeoFormer achieved the highest overall NCC (0.64), followed by RetinaRegNet and EyeLiner (both 0.56). Notably, NCC rankings contradicted landmark-based metrics. Despite GeoFormer achieving the highest intensity correlation, it exhibited the poorest landmark accuracy (MLE: 6.06 pixels), lowest AUC (0.76), and poorest success rate (88.06%). These findings indicate that intensity-based similarity does not reliably reflect geometric alignment accuracy.</p>
<p id="p-35">Overall, RetinaRegNet demonstrates lower mean error, higher success rate, and superior AUC compared with EyeLiner and GeoFormer across all landmark-based metrics, while GeoFormer exhibits the widest error distribution despite achieving higher intensity-based correlation.</p>
<p id="p-36">Across difficulty classes, RetinaRegNet achieves superior overall performance (3.12 pixels), with the lowest MLE in Classes S and P and mean performance in Class A that is close to EyeLiner. EyeLiner attains the lowest mean MLE in Class A, although its advantage over RetinaRegNet is not statistically significant in this subset. Both methods consistently outperform GeoFormer, which shows particularly large errors in Class P. These findings underscore the importance of class-specific evaluation in assessing the robustness and generalization capabilities of retinal registration methods.</p>
</sec>
<sec id="t3-2">
<title>Class-specific performance analysis</title>
<sec id="t3-2-1">
<title>Class S performance</title>
<p id="p-37">Class S, representing the largest subset with 71 samples, proves to be the least challenging for all methods (<xref ref-type="fig" rid="fig7">Figure 7A</xref>). RetinaRegNet obtained the best performance with an MLE of 1.70 pixels, outperforming EyeLiner (1.80 pixels) and GeoFormer (2.42 pixels) by 6% and 42%, respectively. The consistently low error rates across all methods indicate that Class S images possess favourable characteristics for vessel landmark localisation. These differences were statistically significant, as shown in <xref ref-type="table" rid="t3">Table 3</xref> (Friedman <italic>P</italic> = 2 × 10<sup>–14</sup>; all pairwise comparisons reached significance after Bonferroni correction with <italic>P</italic> ≤ 8 × 10<sup>–4</sup>). AUC analysis (<xref ref-type="table" rid="t1">Table 1</xref>) demonstrated RetinaRegNet’s superior performance (0.95), followed by EyeLiner (0.93) and GeoFormer (0.91). All three methods achieved 100% success rates (<xref ref-type="table" rid="t1">Table 1</xref>), confirming the relative ease of Class S registration tasks. NCC values (<xref ref-type="table" rid="t1">Table 1</xref>) were comparable across methods (RetinaRegNet: 0.75, EyeLiner: 0.74, GeoFormer: 0.72), indicating similar intensity-based alignment in high-overlap scenarios. Overall, RetinaRegNet provides consistently lower landmark error and higher AUC than EyeLiner and GeoFormer in this class.</p>
<fig id="fig7" position="float">
<label>Figure 7</label>
<caption>
<p id="fig7-p-1">
<bold>Mean Landmark Error distribution by class and method on the FIRE dataset.</bold> Boxplots show the performance comparison of the three methods across the three classes. Diamond markers indicate mean values, horizontal lines show medians, boxes represent interquartile ranges (IQR), whiskers extend to 1.5 × IQR, and red circles represent outliers. Lower values indicate better registration accuracy.</p>
</caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="edht-04-101194-g007.tif" />
</fig>
<table-wrap id="t3">
<label>Table 3</label>
<caption>
<p id="t3-p-1">
<bold>Statistical comparison of MLE for Class S images (<italic>n</italic> = 71).</bold>
</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>
<bold>Comparison</bold>
</th>
<th>
<bold>Test</bold>
</th>
<th>
<bold>Statistic</bold>
</th>
<th>
<bold>Raw</bold>
<break />
<bold>
<italic>P</italic>-value</bold>
</th>
<th>
<bold>Bonferroni</bold>
<break />
<bold>
<italic>P</italic>-value</bold>
</th>
<th>
<bold>Mean Difference</bold>
<break />
<bold>(A–B)</bold>
</th>
</tr>
</thead>
<tbody>
<tr>
<td>Friedman</td>
<td>χ²(2)</td>
<td>63.577</td>
<td>2 × 10<sup>–14</sup></td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>RetinaRegNet vs EyeLiner</td>
<td>Wilcoxon V</td>
<td>640</td>
<td>2 × 10<sup>–4</sup></td>
<td>8 × 10<sup>–4</sup> ***</td>
<td>−0.204</td>
</tr>
<tr>
<td>RetinaRegNet vs GeoFormer</td>
<td>Wilcoxon V</td>
<td>51</td>
<td>2 × 10<sup>–12</sup></td>
<td>6 × 10<sup>–12</sup> ***</td>
<td>−0.729</td>
</tr>
<tr>
<td>EyeLiner vs GeoFormer</td>
<td>Wilcoxon V</td>
<td>339</td>
<td>8 × 10<sup>–8</sup></td>
<td>2 × 10<sup>–7</sup> ***</td>
<td>−0.524</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p id="t3-fn-1">Asterisks denote Bonferroni-adjusted significance: * <italic>P</italic> &lt; 0.05, ** <italic>P</italic> &lt; 0.01, *** <italic>P</italic> &lt; 0.001.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="t3-2-2">
<title>Class A performance</title>
<p id="p-38">Class A, representing moderate cases with anatomical change (14 samples), shows intermediate difficulty (<xref ref-type="fig" rid="fig7">Figure 7B</xref>). EyeLiner attained the lowest mean MLE in this class (4.87 pixels), followed closely by RetinaRegNet (5.24 pixels); however, this difference is not statistically significant, as shown in <xref ref-type="table" rid="t4">Table 4</xref> (adjusted <italic>P</italic> = 0.776). GeoFormer showed higher error at 6.55 pixels and performed significantly worse than EyeLiner (adjusted <italic>P</italic> = 0.012). AUC analysis (<xref ref-type="table" rid="t1">Table 1</xref>) showed EyeLiner achieved the highest performance (0.80), followed by RetinaRegNet (0.79) and GeoFormer (0.74). All three methods achieved identical 92.86% success rates (<xref ref-type="table" rid="t1">Table 1</xref>). NCC values (<xref ref-type="table" rid="t1">Table 1</xref>) showed minimal variation across methods (RetinaRegNet: 0.60, EyeLiner: 0.63, GeoFormer: 0.62), indicating comparable intensity-based similarity. These results indicate that performance differences between the methods are less pronounced in this subset, reflecting the moderate difficulty of Class A cases. However, these results should be interpreted with caution due to the limited sample size.</p>
<table-wrap id="t4">
<label>Table 4</label>
<caption>
<p id="t4-p-1">
<bold>Statistical comparison of MLE for Class A images (<italic>n</italic> = 14).</bold>
</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>
<bold>Comparison</bold>
</th>
<th>
<bold>Test</bold>
</th>
<th>
<bold>Statistic</bold>
</th>
<th>
<bold>Raw</bold>
<break />
<bold>
<italic>P</italic>-value</bold>
</th>
<th>
<bold>Bonferroni</bold>
<break />
<bold>
<italic>P</italic>-value</bold>
</th>
<th>
<bold>Mean difference</bold>
<break />
<bold>(A–B)</bold>
</th>
</tr>
</thead>
<tbody>
<tr>
<td>Friedman</td>
<td>χ²(2)</td>
<td>8.143</td>
<td>0.017</td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>RetinaRegNet vs EyeLiner</td>
<td>Wilcoxon V</td>
<td>71</td>
<td>0.259</td>
<td>0.776</td>
<td>0.340</td>
</tr>
<tr>
<td>RetinaRegNet vs GeoFormer</td>
<td>Wilcoxon V</td>
<td>18</td>
<td>0.033</td>
<td>0.098</td>
<td>−1.147</td>
</tr>
<tr>
<td>EyeLiner vs GeoFormer</td>
<td>Wilcoxon V</td>
<td>6</td>
<td>0.004</td>
<td>0.012*</td>
<td>−1.487</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p id="t4-fn-1">Asterisks denote Bonferroni-adjusted significance: * <italic>P</italic> &lt; 0.05, ** <italic>P</italic> &lt; 0.01, *** <italic>P</italic> &lt; 0.001.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="t3-2-3">
<title>Class P performance</title>
<p id="p-39">Performance degradation occurs across all methods on Class P samples (49 samples), the most challenging category (<xref ref-type="fig" rid="fig7">Figure 7C</xref>). RetinaRegNet maintains relatively strong performance with an MLE of 4.57 pixels, while EyeLiner records 6.01 pixels. GeoFormer experiences substantial difficulty in this class, with its error increasing to 11.20 pixels (145% increase compared to RetinaRegNet). This pronounced performance gap reveals that GeoFormer is particularly sensitive to the challenging characteristics present in low-overlap images. All pairwise differences in Class P were statistically significant after Bonferroni correction, as shown in <xref ref-type="table" rid="t5">Table 5</xref>, with RetinaRegNet significantly outperforming both EyeLiner (adjusted <italic>P</italic> = 6 × 10<sup>–4</sup>) and GeoFormer (adjusted <italic>P</italic> = 4 × 10<sup>–9</sup>), and EyeLiner also significantly outperforming GeoFormer (adjusted <italic>P</italic> = 4 × 10<sup>–8</sup>). AUC analysis (<xref ref-type="table" rid="t1">Table 1</xref>) demonstrated RetinaRegNet’s substantial advantage (0.85), compared to EyeLiner (0.76) and GeoFormer (0.54), representing a 57% relative improvement over GeoFormer. Success rate analysis (<xref ref-type="table" rid="t1">Table 1</xref>) showed RetinaRegNet achieved 95.92%, EyeLiner 93.88%, while GeoFormer dropped to 69.39%, indicating frequent registration failures in challenging low-overlap scenarios. Notably, NCC values (<xref ref-type="table" rid="t1">Table 1</xref>) revealed contrasting patterns: GeoFormer maintained the highest NCC (0.53) while RetinaRegNet and EyeLiner decreased to 0.29 and 0.28, respectively. This discordance between high intensity correlation and poor landmark accuracy in GeoFormer highlights that NCC does not reliably reflect geometric alignment quality in challenging registration scenarios.</p>
<table-wrap id="t5">
<label>Table 5</label>
<caption>
<p id="t5-p-1">
<bold>Statistical comparison of MLE for Class P images (<italic>n</italic> = 49).</bold>
</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>
<bold>Comparison</bold>
</th>
<th>
<bold>Test</bold>
</th>
<th>
<bold>Statistic</bold>
</th>
<th>
<bold>Raw</bold>
<break />
<bold>
<italic>P</italic>-value</bold>
</th>
<th>
<bold>Bonferroni</bold>
<break />
<bold>
<italic>P</italic>-value</bold>
</th>
<th>
<bold>Mean difference (A–B)</bold>
</th>
</tr>
</thead>
<tbody>
<tr>
<td>Friedman</td>
<td>χ²(2)</td>
<td>71.306</td>
<td>3 × 10<sup>–16</sup></td>
<td>–</td>
<td>–</td>
</tr>
<tr>
<td>RetinaRegNet vs EyeLiner</td>
<td>Wilcoxon V</td>
<td>238</td>
<td>2 × 10<sup>–4</sup></td>
<td>6 × 10<sup>–4</sup> ***</td>
<td>−1.513</td>
</tr>
<tr>
<td>RetinaRegNet vs GeoFormer</td>
<td>Wilcoxon V</td>
<td>1</td>
<td>1 × 10<sup>–9</sup></td>
<td>4 × 10<sup>–9</sup> ***</td>
<td>−6.957</td>
</tr>
<tr>
<td>EyeLiner vs GeoFormer</td>
<td>Wilcoxon V</td>
<td>41</td>
<td>1 × 10<sup>–8</sup></td>
<td>4 × 10<sup>–8</sup> ***</td>
<td>−5.444</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p id="t5-fn-1">Asterisks denote Bonferroni-adjusted significance: * <italic>P</italic> &lt; 0.05, ** <italic>P</italic> &lt; 0.01, *** <italic>P</italic> &lt; 0.001.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="t3-3">
<title>Runtime analysis</title>
<p id="p-40">
<xref ref-type="table" rid="t6">Table 6</xref> presents the runtime performance analysis of the evaluated methods. GeoFormer demonstrates the fastest processing speed at 0.32 seconds per image, offering real-time processing capability. EyeLiner achieves moderate computational efficiency at 4.92 seconds per image. This includes 4.22 seconds for AutoMorph vessel segmentation and 0.70 seconds for the registration process itself, making it approximately 15 times slower than GeoFormer. RetinaRegNet exhibits the highest computational cost at 31.23 seconds per image, representing a 98-fold increase compared to GeoFormer and a 6.4-fold increase compared to EyeLiner.</p>
<table-wrap id="t6">
<label>Table 6</label>
<caption>
<p id="t6-p-1">
<bold>Runtime performance comparison.</bold>
</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>
<bold>Method</bold>
</th>
<th>
<bold>Time per image (seconds)</bold>
</th>
</tr>
</thead>
<tbody>
<tr>
<td>RetinaRegNet</td>
<td>31.23</td>
</tr>
<tr>
<td>EyeLiner</td>
<td>4.92</td>
</tr>
<tr>
<td>GeoFormer</td>
<td>0.32</td>
</tr>
</tbody>
</table>
</table-wrap>
<p id="p-41">These computational differences highlight the trade-offs between different architectural approaches. EyeLiner’s reliance on explicit vessel segmentation via AutoMorph adds substantial preprocessing overhead compared to end-to-end approaches. Due to its complex architecture, RetinaRegNet’s multi-stage registration pipeline with iterative refinement steps results in considerably longer processing times. In contrast, GeoFormer’s transformer-based architecture enables direct processing of raw retinal images without intermediate segmentation steps, resulting in significantly faster inference times suitable for clinical deployment. The theoretical computational complexity and scalability characteristics of each pipeline across individual processing stages are analysed in <xref ref-type="sec" rid="s-suppl">Table S5</xref>.</p>
</sec>
</sec>
<sec id="s4">
<title>Discussion</title>
<p id="p-42">Our study highlights how the different design choices of the three methods influence their performance across the FIRE dataset. RetinaRegNet achieves the lowest overall MLE and consistently high success rate and AUC, indicating strong capability in aligning anatomical landmarks even under challenging imaging conditions, but it takes the longest to complete registration. RetinaRegNet and EyeLiner performed similarly well in Class A, where the small sample size limits the ability to resolve subtle performance differences. In contrast, in Class P, which involves reduced overlap, RetinaRegNet demonstrates a clear advantage, indicating greater robustness under challenging alignment conditions. This was not the case for GeoFormer, whose reduced performance in difficult scenarios can largely be attributed to its architectural reliance on a global homography-only transformation, although insufficient fine-scale deformation modelling or dataset-specific effects may also contribute. EyeLiner produces competitive accuracy with relatively stable behaviour across the three difficulty classes at a moderate speed. GeoFormer offers very fast processing but displays substantially higher error, particularly in cases with reduced overlap or anatomical distortion. Notably, this trend contrasts with intensity-based similarity measures such as NCC, where GeoFormer attains higher scores despite poorer landmark alignment, reinforcing that NCC reflects global photometric agreement rather than precise anatomical correspondence. These findings confirm that architectural choices, such as the use of local deformation models and anatomically guided correspondence strategies, play an important role in determining landmark-based registration accuracy and robustness.</p>
<p id="p-43">The observed accuracy differences have direct implications for clinical deployment of retinal registration pipelines. RetinaRegNet’s strong performance in challenging cases supports its use in longitudinal monitoring, where precise alignment is critical for detecting subtle structural change over time. EyeLiner offers a practical compromise for routine follow-up and vessel-centric analysis by balancing competitive accuracy in moderate-difficulty settings with lower computational cost and clear anatomical interpretability. In contrast, GeoFormer’s rapid inference makes it suitable for high-throughput applications such as screening or acquisition quality control, provided that difficult cases are identified and redirected to a more robust registration pipeline. Method selection, therefore, depends on the required balance between registration precision, robustness to failure, and computational efficiency.</p>
<p id="p-44">Although RetinaRegNet exhibits the highest computational cost among the evaluated methods, this runtime is most relevant to offline longitudinal analysis rather than real-time clinical deployment. In many longitudinal retinal imaging workflows, such as disease progression studies or retrospective cohort analyses, robustness and alignment accuracy are prioritised over per-image latency, making runtimes on the order of tens of seconds acceptable. The current implementation does not support straightforward batching across image pairs due to its sequential correspondence filtering and transformation stages; improving throughput through architectural or implementation-level optimisation remains an avenue for future work.</p>
<p id="p-45">Our results confirm the findings of previous work [<xref ref-type="bibr" rid="B17">17</xref>], showing that incorporating local deformation, such as polynomial warping in RetinaRegNet and thin-plate spline warping in EyeLiner, improves landmark-based alignment accuracy, as reflected in the superior performance of both methods. Inverse consistency in RetinaRegNet helps mitigate inconsistent keypoint matches in regions where the underlying diffusion model exhibits low confidence. However, it may become less effective when the model is highly confident, which is, in practice, typically the case for well-pretrained models. Consequently, its contribution is less critical in this setting, as EyeLiner is able to achieve comparable performance without imposing strong consistency constraints. Anatomy-guided filtering in EyeLiner encourages focus on vascular structure, but can be disadvantageous when vessel morphology changes drastically, leaving large regions without matched keypoints, as also noted in prior anatomy-guided approaches. RetinaRegNet alleviates this overemphasis on edges and high-contrast regions through random keypoint sampling, promoting coverage across nearly the whole image. However, this strategy may mislead the warping algorithm in the presence of anatomical changes, resulting in slightly inferior performance in Class A.</p>
<p id="p-46">This study is limited by its reliance on the FIRE dataset, which contains only colour fundus photographs captured with a single 45-degree field-of-view camera at a fixed resolution. Results may differ for other imaging modalities such as autofluorescence, infrared, fluorescein angiography, or widefield imaging. Class A is relatively small in size (<italic>n</italic> = 14), which may limit statistical power to detect subtle performance differences between top-performing methods; future studies using larger, more balanced datasets are required to confirm these trends. Evaluation was based on ten manually annotated vascular landmarks per image pair, which capture alignment accuracy at a limited number of locations and may not fully reflect performance across the entire retina, particularly near lesions or in peripheral regions. Pixel-based errors were not converted into physical units, which limits direct clinical interpretation across devices with different magnifications or sensor characteristics. Finally, the lack of validation across multiple independent datasets limits the generalisability of the findings, and further studies on diverse datasets are required to address this limitation.</p>
<p id="p-47">This study presents an in-depth comparative evaluation of three deep learning-based retinal registration pipelines using the FIRE dataset. RetinaRegNet achieves the highest accuracy across the full dataset and in the most challenging low-overlap cases. EyeLiner provides reliable performance with moderate computational cost through anatomically guided matching and flexible warping. GeoFormer offers the fastest processing speed, but at the expense of reduced robustness in difficult scenarios. These findings underscore the value of combining global alignment with locally adaptive deformation when precise registration is required and highlight the importance of selecting a registration method that aligns with the clinical or research needs of the intended application. Future work should evaluate these approaches across diverse imaging modalities and explore optimisation strategies to improve the computational efficiency of high-accuracy models.</p>
</sec>
</body>
<back>
<glossary>
<title>Abbreviations</title>
<def-list>
<def-item>
<term>AUC</term>
<def>
<p>Area Under the Curve</p>
</def>
</def-item>
<def-item>
<term>CNNs</term>
<def>
<p>convolutional neural networks</p>
</def>
</def-item>
<def-item>
<term>IQR</term>
<def>
<p>Interquartile Range</p>
</def>
</def-item>
<def-item>
<term>LoFTR</term>
<def>
<p>Local Feature Transformer</p>
</def>
</def-item>
<def-item>
<term>MLE</term>
<def>
<p>mean landmark error</p>
</def>
</def-item>
<def-item>
<term>NCC</term>
<def>
<p>Normalized Cross Correlation</p>
</def>
</def-item>
<def-item>
<term>OCT</term>
<def>
<p>optical coherence tomography</p>
</def>
</def-item>
<def-item>
<term>RANSAC</term>
<def>
<p>Random Sample Consensus</p>
</def>
</def-item>
<def-item>
<term>SIFT</term>
<def>
<p>Scale-Invariant Feature Transform</p>
</def>
</def-item>
</def-list>
</glossary>
<sec id="s-suppl" sec-type="supplementary-material">
<title>Supplementary materials</title>
<p>The supplementary materials for this article are available at: <uri xlink:href="https://www.explorationpub.com/uploads/Article/file/101194_sup_1.pdf">https://www.explorationpub.com/uploads/Article/file/101194_sup_1.pdf</uri>.</p>
<supplementary-material id="SD1" content-type="local-data">
<media xlink:href="101194_sup_1.pdf" mimetype="application" mime-subtype="pdf"></media>
</supplementary-material>
</sec>
<sec id="s6">
<title>Declarations</title>
<sec id="t-6-1">
<title>Author contributions</title>
<p>TD: Conceptualization, Methodology, Data curation, Formal analysis, Project administration, Writing—original draft, Writing—review &amp; editing, Visualization, Funding acquisition. NS: Conceptualization, Investigation, Methodology, Formal analysis, Data curation, Visualization, Writing—original draft, Writing—review &amp; editing. YWC: Supervision, Conceptualization, Methodology, Validation, Writing—review &amp; editing, Funding acquisition. SA: Supervision, Writing—review &amp; editing, Writing—original draft. KD: Investigation, Writing—original draft, Writing—review &amp; editing. PG: Writing—review &amp; editing. YTC: Writing—review &amp; editing. AJ: Validation. NP: Supervision, Project administration, Validation, Writing—review &amp; editing, Funding acquisition. All authors read and approved the submitted version.</p>
</sec>
<sec id="t-6-2" sec-type="COI-statement">
<title>Conflicts of interest</title>
<p>NP is a patent holder of PCT/EP2023/076614 filed by UCL Business. The authors declare that they have no other relevant conflicts of interest.</p>
</sec>
<sec id="t-6-3">
<title>Ethical approval</title>
<p>This study utilised the FIRE dataset, which is publicly available, so it involves no new data collection or direct interaction with human participants. All data are fully anonymised. Therefore, ethical approval was not required.</p>
</sec>
<sec id="t-6-4">
<title>Consent to participate</title>
<p>This study involved secondary analysis of publicly available, anonymised retinal image data and did not involve direct participation of human subjects. Accordingly, consent to participate was not required.</p>
</sec>
<sec id="t-6-5">
<title>Consent to publication</title>
<p>No identifiable individual data are presented in this study. Therefore, consent to publication was not required.</p>
</sec>
<sec id="t-6-6" sec-type="data-availability">
<title>Availability of data and materials</title>
<p>The FIRE dataset analysed during this study is publicly available at: <uri xlink:href="https://projects.ics.forth.gr/cvrl/fire/">https://projects.ics.forth.gr/cvrl/fire/</uri> and was used in accordance with the terms of use. No additional datasets were generated or analysed. Code used for analysis is available from the corresponding author upon reasonable request.</p>
</sec>
<sec id="t-6-7">
<title>Funding</title>
<p>The research was supported by a grant from the National Institute for Health Research (NIHR) Biomedical Research Centre (BRC) at MEH NHS Foundation Trust and UCL Institute of Ophthalmology (grant no. NIHR203322). N.P. is funded by an Artificial Intelligence in Health and Care Award (NIHR AI Award grant no. AI_AWARD02488). The Artificial Intelligence in Health and Care Award is part of the NHS AI Laboratory, which has made funding available to accelerate the testing and evaluation of artificial intelligence technologies that meet the aims set out in the NHS Long Term Plan. The NHS AI Laboratory is a joint unit of teams from the Department of Health and Social Care and NHS England, driving forward the digital transformation of health and social care (<uri xlink:href="https://transform.england.nhs.uk/ai-lab/">https://transform.england.nhs.uk/ai-lab/</uri>). N.P. is also funded by Sight Research UK (grant no. TRN004). N.P. and Y.W.C. are also funded by Medical Research Foundation and Moorfields Eye Charity (grant no. MRF-JF-EH-23-122) and Fight for Sight (grant no. RESSGA250). T.D. and N.P. are also funded by NIHR i4i THRIVE (grant no. NIHR505133). N.P. was also previously funded by Retina UK as part of the UK IRD Consortium, Moorfields Eye Charity Career Development Award (grant no. R190031A), HDRUK (grant no. MC_PC_18036) and by a Translational Innovation grant awarded by the UCL Translational Research Office, which has seed funded this work. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p>
</sec>
<sec id="t-6-8">
<title>Copyright</title>
<p>© The Author(s) 2026.</p>
</sec>
</sec>
<sec id="s7">
<title>Publisher’s note</title>
<p>Open Exploration maintains a neutral stance on jurisdictional claims in published institutional affiliations and maps. All opinions expressed in this article are the personal views of the author(s) and do not represent the stance of the editorial team or the publisher.</p>
</sec>
<ref-list>
<ref id="B1">
<label>1</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramsey</surname>
<given-names>DJ</given-names>
</name>
<name>
<surname>Sunness</surname>
<given-names>JS</given-names>
</name>
<name>
<surname>Malviya</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Applegate</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Hager</surname>
<given-names>GD</given-names>
</name>
<name>
<surname>Handa</surname>
<given-names>JT</given-names>
</name>
</person-group>
<article-title>Automated image alignment and segmentation to follow progression of geographic atrophy in age-related macular degeneration</article-title>
<source>Retina</source>
<year iso-8601-date="2014">2014</year>
<volume>34</volume>
<fpage>1296</fpage>
<lpage>307</lpage>
<pub-id pub-id-type="doi">10.1097/IAE.0000000000000069</pub-id>
<pub-id pub-id-type="pmid">24398699</pub-id>
</element-citation>
</ref>
<ref id="B2">
<label>2</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hussain</surname>
<given-names>MA</given-names>
</name>
<name>
<surname>Govindaiah</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Souied</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>RT</given-names>
</name>
<name>
<surname>Bhuiyan</surname>
<given-names>A</given-names>
</name>
</person-group>
<comment>Automated tracking and change detection for age-related macular degeneration progression using retinal fundus imaging. 2018 Joint 7th International Conference on Informatics, Electronics &amp; Vision (ICIEV) and 2018 2nd International Conference on Imaging, Vision &amp; Pattern Recognition (icIVPR); 2018 Jun 25–29; Kitakyushu, Japan. IEEE; 2018. pp. 394–8.</comment>
<pub-id pub-id-type="doi">10.1109/ICIEV.2018.8641078</pub-id>
</element-citation>
</ref>
<ref id="B3">
<label>3</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Balakrishnan</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Sabuncu</surname>
<given-names>MR</given-names>
</name>
<name>
<surname>Guttag</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Dalca</surname>
<given-names>AV</given-names>
</name>
</person-group>
<article-title>VoxelMorph: A Learning Framework for Deformable Medical Image Registration</article-title>
<source>IEEE Trans Med Imaging</source>
<year iso-8601-date="2019">2019</year>
<pub-id pub-id-type="doi">10.1109/TMI.2019.2897538</pub-id>
<pub-id pub-id-type="pmid">30716034</pub-id>
</element-citation>
</ref>
<ref id="B4">
<label>4</label>
<element-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Sokooti</surname>
<given-names>H</given-names>
</name>
<name>
<surname>de Vos</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Berendsen</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Lelieveldt</surname>
<given-names>BPF</given-names>
</name>
<name>
<surname>Išgum</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Staring</surname>
<given-names>M</given-names>
</name>
</person-group>
<article-title>Nonrigid Image Registration Using Multi-scale 3D Convolutional Neural Networks</article-title>
<person-group person-group-type="editor">
<name>
<surname>Descoteaux</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Maier-Hein</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Franz</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Jannin</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Collins</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Duchesne</surname>
<given-names>S</given-names>
</name>
</person-group>
<source>Medical Image Computing and Computer Assisted Intervention − MICCAI 2017</source>
<publisher-loc>Cham</publisher-loc>
<publisher-name>Springer</publisher-name>
<year iso-8601-date="2017">2017</year>
<pub-id pub-id-type="doi">10.1007/978-3-319-66182-7_27</pub-id>
</element-citation>
</ref>
<ref id="B5">
<label>5</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Cavichini</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Bartsch</surname>
<given-names>DG</given-names>
</name>
<name>
<surname>Freeman</surname>
<given-names>WR</given-names>
</name>
<etal>et al.</etal>
</person-group>
<article-title>Two-Step Registration on Multi-Modal Retinal Images via Deep Neural Networks</article-title>
<source>IEEE Trans Image Process</source>
<year iso-8601-date="2022">2022</year>
<volume>31</volume>
<fpage>823</fpage>
<lpage>38</lpage>
<pub-id pub-id-type="doi">10.1109/TIP.2021.3135708</pub-id>
<pub-id pub-id-type="pmid">34932479</pub-id>
<pub-id pub-id-type="pmcid">PMC8912939</pub-id>
</element-citation>
</ref>
<ref id="B6">
<label>6</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Bartsch</surname>
<given-names>DG</given-names>
</name>
<name>
<surname>Freeman</surname>
<given-names>WR</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>TQ</given-names>
</name>
<name>
<surname>An</surname>
<given-names>C</given-names>
</name>
</person-group>
<article-title>Perspective Distortion Correction for Multi-Modal Registration between Ultra-Widefield and Narrow-Angle Retinal Images</article-title>
<source>Annu Int Conf IEEE Eng Med Biol Soc</source>
<year iso-8601-date="2021">2021</year>
<volume>2021</volume>
<fpage>4086</fpage>
<lpage>91</lpage>
<pub-id pub-id-type="doi">10.1109/EMBC46164.2021.9631084</pub-id>
<pub-id pub-id-type="pmid">34892126</pub-id>
<pub-id pub-id-type="pmcid">PMC9359414</pub-id>
</element-citation>
</ref>
<ref id="B7">
<label>7</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>An</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Amador</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Bartsch</surname>
<given-names>DU</given-names>
</name>
<name>
<surname>Borooah</surname>
<given-names>S</given-names>
</name>
</person-group>
<comment>Joint Vessel Segmentation and Deformable Registration on Multi-Modal Retinal Images Based on Style Transfer. 2019 IEEE International Conference on Image Processing (ICIP); 2019 Sep 22–25; Taipei, Taiwan. IEEE; 2019. pp. 839–43.</comment>
<pub-id pub-id-type="doi">10.1109/ICIP.2019.8802932</pub-id>
</element-citation>
</ref>
<ref id="B8">
<label>8</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Benvenuto</surname>
<given-names>GA</given-names>
</name>
<name>
<surname>Colnago</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Casaca</surname>
<given-names>W</given-names>
</name>
</person-group>
<comment>Unsupervised Deep Learning Network for Deformable Fundus Image Registration. ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP); 2022 May 23–27; Singapore, Singapore. IEEE; 2022. pp. 1281–5.</comment>
<pub-id pub-id-type="doi">10.1109/ICASSP43922.2022.9747686</pub-id>
</element-citation>
</ref>
<ref id="B9">
<label>9</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tian</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Hao</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Mou</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J</given-names>
</name>
<etal>et al.</etal>
</person-group>
<article-title>Multi-scale U-net with Edge Guidance for Multimodal Retinal Image Deformable Registration</article-title>
<source>Annu Int Conf IEEE Eng Med Biol Soc</source>
<year iso-8601-date="2020">2020</year>
<volume>2020</volume>
<fpage>1360</fpage>
<lpage>3</lpage>
<pub-id pub-id-type="doi">10.1109/EMBC44109.2020.9175613</pub-id>
<pub-id pub-id-type="pmid">33018241</pub-id>
</element-citation>
</ref>
<ref id="B10">
<label>10</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zou</surname>
<given-names>B</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S</given-names>
</name>
</person-group>
<article-title>Non-rigid retinal image registration using an unsupervised structure-driven regression network</article-title>
<source>Neurocomputing</source>
<year iso-8601-date="2020">2020</year>
<volume>404</volume>
<fpage>14</fpage>
<lpage>25</lpage>
<pub-id pub-id-type="doi">10.1016/J.NEUCOM.2020.04.122</pub-id>
</element-citation>
</ref>
<ref id="B11">
<label>11</label>
<element-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>D</given-names>
</name>
</person-group>
<article-title>Semi-Supervised Keypoint Detector and Descriptor for Retinal Image Matching</article-title>
<person-group person-group-type="editor">
<name>
<surname>Avidan</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Brostow</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Cissé</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Farinella</surname>
<given-names>GM</given-names>
</name>
<name>
<surname>Hassner</surname>
<given-names>T</given-names>
</name>
</person-group>
<source>Computer Vision – ECCV 2022</source>
<comment>ECCV 2022. Lecture Notes in Computer Science. 2022. pp. 593–609.</comment>
<pub-id pub-id-type="doi">10.1007/978-3-031-19803-8_35</pub-id>
</element-citation>
</ref>
<ref id="B12">
<label>12</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>AQ</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>EM</given-names>
</name>
<name>
<surname>Dalca</surname>
<given-names>AV</given-names>
</name>
<name>
<surname>Sabuncu</surname>
<given-names>MR</given-names>
</name>
</person-group>
<article-title>A robust and interpretable deep learning framework for multi-modal registration via keypoints</article-title>
<source>Med Image Anal</source>
<year iso-8601-date="2023">2023</year>
<volume>90</volume>
<elocation-id>102962</elocation-id>
<pub-id pub-id-type="doi">10.1016/j.media.2023.102962</pub-id>
<pub-id pub-id-type="pmid">37769550</pub-id>
<pub-id pub-id-type="pmcid">PMC10591968</pub-id>
</element-citation>
</ref>
<ref id="B13">
<label>13</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Frey</surname>
<given-names>EC</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Segars</surname>
<given-names>WP</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>Y</given-names>
</name>
</person-group>
<article-title>TransMorph: Transformer for unsupervised medical image registration</article-title>
<source>Med Image Anal</source>
<year iso-8601-date="2022">2022</year>
<volume>82</volume>
<elocation-id>102615</elocation-id>
<pub-id pub-id-type="doi">10.1016/j.media.2022.102615</pub-id>
<pub-id pub-id-type="pmid">36156420</pub-id>
<pub-id pub-id-type="pmcid">PMC9999483</pub-id>
</element-citation>
</ref>
<ref id="B14">
<label>14</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Bao</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X</given-names>
</name>
</person-group>
<comment>LoFTR: Detector-Free Local Feature Matching with Transformers. Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition; 2021 Jun 20–25; Nashville, USA. IEEE; 2021. pp. 8918–27.</comment>
<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00881</pub-id>
</element-citation>
</ref>
<ref id="B15">
<label>15</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ho</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jain</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P</given-names>
</name>
</person-group>
<article-title>Denoising Diffusion Probabilistic Models</article-title>
<source>Adv Neural Inf Process Syst</source>
<year iso-8601-date="2020">2020</year>
</element-citation>
</ref>
<ref id="B16">
<label>16</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rombach</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Blattmann</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Lorenz</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Esser</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Ommer</surname>
<given-names>B</given-names>
</name>
</person-group>
<comment>High-Resolution Image Synthesis with Latent Diffusion Models. Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition; 2022 Jun 18–24; New Orleans, USA. IEEE; 2022. pp. 10674–85.</comment>
<pub-id pub-id-type="doi">10.1109/CVPR52688.2022.01042</pub-id>
</element-citation>
</ref>
<ref id="B17">
<label>17</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sivaraman</surname>
<given-names>VB</given-names>
</name>
<name>
<surname>Imran</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Muralidharan</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Tamplin</surname>
<given-names>MR</given-names>
</name>
<name>
<surname>Grumbach</surname>
<given-names>IM</given-names>
</name>
<etal>et al.</etal>
</person-group>
<article-title>RetinaRegNet: A zero-shot approach for retinal image registration</article-title>
<source>Comput Biol Med</source>
<year iso-8601-date="2025">2025</year>
<volume>186</volume>
<elocation-id>109645</elocation-id>
<pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.109645</pub-id>
<pub-id pub-id-type="pmid">39813746</pub-id>
<pub-id pub-id-type="pmcid">PMC13063832</pub-id>
</element-citation>
</ref>
<ref id="B18">
<label>18</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Veturi</surname>
<given-names>YA</given-names>
</name>
<name>
<surname>McNamara</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Kinder</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Clark</surname>
<given-names>CW</given-names>
</name>
<name>
<surname>Thakuria</surname>
<given-names>U</given-names>
</name>
<name>
<surname>Bearce</surname>
<given-names>B</given-names>
</name>
<etal>et al.</etal>
</person-group>
<article-title>EyeLiner: A Deep Learning Pipeline for Longitudinal Image Registration Using Fundus Landmarks</article-title>
<source>Ophthalmol Sci</source>
<year iso-8601-date="2024">2024</year>
<volume>5</volume>
<elocation-id>100664</elocation-id>
<pub-id pub-id-type="doi">10.1016/j.xops.2024.100664</pub-id>
<pub-id pub-id-type="pmid">39877463</pub-id>
<pub-id pub-id-type="pmcid">PMC11773051</pub-id>
</element-citation>
</ref>
<ref id="B19">
<label>19</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
</person-group>
<comment>Geometrized Transformer for Self-Supervised Homography Estimation. Proceedings of the IEEE International Conference on Computer Vision; 2023 Oct 1–6; Paris, France. IEEE; 2023. pp. 9522–31.</comment>
<pub-id pub-id-type="doi">10.1109/ICCV51070.2023.00876</pub-id>
</element-citation>
</ref>
<ref id="B20">
<label>20</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hernandez-Matas</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Zabulis</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Triantafyllou</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Anyfanti</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Douma</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Argyros</surname>
<given-names>AA</given-names>
</name>
</person-group>
<article-title>FIRE: Fundus Image Registration dataset</article-title>
<source>Model Artif Intell Ophthalmol</source>
<year iso-8601-date="2017">2017</year>
<volume>1</volume>
<fpage>16</fpage>
<lpage>28</lpage>
<pub-id pub-id-type="doi">10.35119/MAIO.V1I4.42</pub-id>
</element-citation>
</ref>
<ref id="B21">
<label>21</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wagner</surname>
<given-names>SK</given-names>
</name>
<name>
<surname>Chia</surname>
<given-names>MA</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Woodward-Court</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>M</given-names>
</name>
<etal>et al.</etal>
</person-group>
<article-title>AutoMorph: Automated Retinal Vascular Morphology Quantification Via a Deep Learning Pipeline</article-title>
<source>Transl Vis Sci Technol</source>
<year iso-8601-date="2022">2022</year>
<volume>11</volume>
<elocation-id>12</elocation-id>
<pub-id pub-id-type="doi">10.1167/tvst.11.7.12</pub-id>
<pub-id pub-id-type="pmid">35833885</pub-id>
<pub-id pub-id-type="pmcid">PMC9290317</pub-id>
</element-citation>
</ref>
<ref id="B22">
<label>22</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Schwing</surname>
<given-names>AG</given-names>
</name>
<name>
<surname>Kirillov</surname>
<given-names>A</given-names>
</name>
</person-group>
<article-title>Per-Pixel Classification is Not All You Need for Semantic Segmentation</article-title>
<source>Adv Neural Inf Process Syst</source>
<year iso-8601-date="2021">2021</year>
<volume>34</volume>
<fpage>17864</fpage>
<lpage>75</lpage>
</element-citation>
</ref>
<ref id="B23">
<label>23</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>DeTone</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Malisiewicz</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Rabinovich</surname>
<given-names>A</given-names>
</name>
</person-group>
<comment>SuperPoint: Self-Supervised Interest Point Detection and Description. 2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW); 2018 Jun 18–22; Salt Lake City, USA. IEEE; 2018. pp. 337–49.</comment>
<pub-id pub-id-type="doi">10.1109/CVPRW.2018.00060</pub-id>
</element-citation>
</ref>
<ref id="B24">
<label>24</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lindenberger</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Sarlin</surname>
<given-names>PE</given-names>
</name>
<name>
<surname>Pollefeys</surname>
<given-names>M</given-names>
</name>
</person-group>
<comment>LightGlue: Local Feature Matching at Light Speed. 2023 IEEE/CVF International Conference on Computer Vision (ICCV); 2023 Oct 1–6; Paris, France. IEEE; 2023. pp. 17581–92.</comment>
<pub-id pub-id-type="doi">10.1109/ICCV51070.2023.01616</pub-id>
</element-citation>
</ref>
<ref id="B25">
<label>25</label>
<element-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>TY</given-names>
</name>
<name>
<surname>Dollar</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Hariharan</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Belongie</surname>
<given-names>S</given-names>
</name>
</person-group>
<comment>Feature Pyramid Networks for Object Detection. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR); 2017 Jul 21–26; Honolulu, HI, USA. IEEE; 2017. pp. 936–44.</comment>
<pub-id pub-id-type="doi">10.1109/CVPR.2017.106</pub-id>
</element-citation>
</ref>
</ref-list>
</back>
</article>