Optimizing Global Contrastive Loss with Automatic Temperature Individualization (iSogCLR)

Author: Zi-Hao Qiu, Xiyuan Wei, Zhuoning Yuan, Tianbao Yang

Introduction

In this tutorial, we introduce the application of the iSogCLR algorithm to a typical bimodal contrastive learning task. In the pretraining stage, we sample a subset of the widely used CC3M dataset, which contains about 3,000,000 image-text pairs. We then evaluate the pretrained models via zero-shot image/text retrieval on the MS-COCO dataset.

For the convenience of reproduction, we provide a subset of CC3M here, which contains 300,000 image-text pairs. We also provide the MS-COCO dataset and its JSON files here. The experiment in this tutorial is conducted on 4 Nvidia 3090 GPUs; you can modify the CUDA_VISIBLE_DEVICES option and the batch_size_train option based on your equipment.

References

If you find this tutorial helpful in your work, please cite our library paper and the following papers:

@inproceedings{qiu2023isogclr,
     title={Not All Semantics are Created Equal: Contrastive Self-supervised Learning with Automatic Temperature Individualization},
     author={Qiu, Zi-Hao and Hu, Quanqi and Yuan, Zhuoning and Zhou, Denny and Zhang, Lijun and Yang, Tianbao},
     booktitle={International Conference on Machine Learning},
     year={2023},
     organization={PMLR}
   }

Import required libs

!pip install -U libauc
!pip install timm
!pip install transformers

import os
# Environment flag read by the HuggingFace tokenizers library; "true" asks it
# to keep its internal parallelism enabled.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# GPU ids visible to this process. It is set here, before `torch` is imported
# further down. The ids are specific to the authors' machine -- change them to
# the devices available on your host.
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7,8,9' # distributed training: '0,1,2,3'

import re
import argparse
from pathlib import Path
import json
import os
import random
import math
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch import optim
import torchvision
from torchvision import transforms

from torch.utils.data import Dataset, Subset, DataLoader

from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
Image.MAX_IMAGE_PIXELS = None

import cv2
import numpy as np

import timm
from transformers import AutoModel, AutoTokenizer

import open_clip
from open_clip.loss import ClipLoss


import libauc
from libauc.losses.contrastive import GCLoss_v2
from libauc.optimizers import iSogCLR
from libauc.utils.paper_utils import CosineLRScheduler

Arguments for experiments

# path to the training data folder and its annotation file
data_path = 'cc3m_subset'
train_file = 'cc3m_subset.json'

# model config
image_encoder = 'resnet50'                 # name passed to timm.create_model
text_encoder = 'distilbert-base-uncased'   # name passed to AutoModel / AutoTokenizer
image_res = 256                            # side length used by the train/test transforms
vision_width = 768                         # NOTE(review): not referenced by the code visible in this file
embed_dim = 256                            # output size of both projection heads
seed = 42

# optimizer and scheduler
opt = 'adamW'
lr = 3e-4
min_lr = 1e-5
warmup = True
warmup_lr = 1e-5
weight_decay = 0.02
decay_rate = 1
epochs = 30
# Despite its name, this value is handed to epoch_train as `warmup_steps`,
# where one step corresponds to 100 training iterations, and to the
# scheduler as `warmup_t`.
warmup_epochs = 20
cooldown_epochs = 0

# training & test settings
batch_size_train = 256   # per-GPU value; the train loader multiplies it by n_gpus
batch_size_test = 512
k_test = 256             # number of top-scoring candidates kept per query in evaluation

# output path
output_dir = './output/'

# AMP training
use_amp = True

# loss config
temp = 0.01       # the temperature parameter for clip or sogclr
gamma = 0.8       # the parameter for the moving average estimator in sogclr/isogclr
rho = 8.0         # the rho parameter for isogclr
eta = 1e-4        # learning rate for the learnable temperature variables in isogclr
tau_init = 0.01   # the initial value of the learnable temperature variables in isogclr
beta_u = 0.9      # the momentum parameter for the gradients of the learnable temperature variables

# number of GPUs visible to PyTorch (depends on CUDA_VISIBLE_DEVICES)
n_gpus = torch.cuda.device_count()

# MS-COCO annotation files and image root used for evaluation
val_coco_file = 'coco_val_new.json'
test_coco_file = 'coco_test_new.json'
coco_image_root = 'coco'

# make sure the output folder exists
Path(output_dir).mkdir(parents=True, exist_ok=True)

Define helper functions

# we employ this function to preprocess the captions
def pre_caption(caption, max_words):
    """Normalize a raw caption and cap it at ``max_words`` words.

    The caption is lower-cased, a fixed set of punctuation characters is
    removed, ``-`` and ``/`` are turned into spaces, ``<person>`` becomes
    ``person``, runs of whitespace are collapsed to one space and the result
    is trimmed. If more than ``max_words`` space-separated words remain, only
    the first ``max_words`` are kept.

    Args:
        caption: the raw caption string.
        max_words: maximum number of words in the returned caption.

    Returns:
        The cleaned (and possibly truncated) caption.
    """
    text = caption.lower()
    text = re.sub(r"([,.'!?\"()*#:;~])", '', text)
    for old, new in (('-', ' '), ('/', ' '), ('<person>', 'person')):
        text = text.replace(old, new)

    # collapse whitespace runs, then trim a trailing newline and outer spaces
    text = re.sub(r"\s{2,}", ' ', text)
    text = text.rstrip('\n').strip(' ')

    words = text.split(' ')
    if len(words) <= max_words:
        return text
    return ' '.join(words[:max_words])

class train_set(Dataset):
    """Image-text pair dataset used for pretraining.

    Each annotation entry is a dict that provides the keys ``'image'`` (path
    relative to ``image_root``), ``'caption'`` and ``'image_id'``.

    Args:
        ann_file: iterable of paths to JSON annotation files; their lists
            are concatenated in order.
        transform: callable applied to the loaded RGB PIL image.
        image_root: folder that the ``'image'`` paths are relative to.
        max_words: maximum number of words kept per caption.
    """

    def __init__(self, ann_file, transform, image_root, max_words=30):
        self.ann = []
        for path in ann_file:
            # a context manager closes the file right after parsing
            with open(path, 'r') as f:
                self.ann += json.load(f)
        self.transform = transform
        self.image_root = image_root
        self.max_words = max_words

        # map every distinct image_id to a dense index in first-seen order
        self.img_ids = {}
        for ann in self.ann:
            self.img_ids.setdefault(ann['image_id'], len(self.img_ids))

    def __len__(self):
        return len(self.ann)

    def __getitem__(self, index):
        """Return ``(image, caption, dense image index, annotation index)``."""
        ann = self.ann[index]
        image_path = os.path.join(self.image_root, ann['image'])

        # convert() yields an independent image, so the file can be closed
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        image = self.transform(image)

        caption = pre_caption(ann['caption'], self.max_words)

        return image, caption, self.img_ids[ann['image_id']], index



class eval_set(Dataset):
    """Retrieval evaluation dataset (one image with several captions).

    Each annotation entry is a dict with an ``'image'`` path (relative to
    ``image_root``) and a ``'caption'`` list. Iterating the dataset yields
    images only; the pre-processed captions and the image/text
    correspondences are exposed as attributes:

    * ``text``: flat list of pre-processed captions,
    * ``image``: list of image paths,
    * ``img2txt``: image index -> list of caption indices,
    * ``txt2img``: caption index -> image index.

    Args:
        ann_file: path to a single JSON annotation file.
        transform: callable applied to the loaded RGB PIL image.
        image_root: folder that the ``'image'`` paths are relative to.
        max_words: maximum number of words kept per caption.
    """

    def __init__(self, ann_file, transform, image_root, max_words=30):
        # a context manager closes the file right after parsing
        with open(ann_file, 'r') as f:
            self.ann = json.load(f)
        self.transform = transform
        self.image_root = image_root
        self.max_words = max_words

        self.text = []
        self.image = []
        self.txt2img = {}
        self.img2txt = {}

        for img_id, ann in enumerate(self.ann):
            self.image.append(ann['image'])
            self.img2txt[img_id] = []
            for caption in ann['caption']:
                # the caption's global index is its position in self.text
                txt_id = len(self.text)
                self.text.append(pre_caption(caption, self.max_words))
                self.img2txt[img_id].append(txt_id)
                self.txt2img[txt_id] = img_id

    def __len__(self):
        return len(self.image)

    def __getitem__(self, index):
        """Return ``(transformed image, image index)``."""
        image_path = os.path.join(self.image_root, self.ann[index]['image'])
        # convert() yields an independent image, so the file can be closed
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        image = self.transform(image)

        return image, index

def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
    """Split the trainable parameters of ``model`` into two optimizer groups.

    A parameter is exempt from weight decay when it is one-dimensional, when
    its name ends with ``.bias`` or when its name is listed in ``skip_list``.
    Parameters with ``requires_grad=False`` are left out entirely.

    Args:
        model: module whose ``named_parameters()`` are partitioned.
        weight_decay: decay value assigned to the non-exempt group.
        skip_list: container of parameter names that must not be decayed.

    Returns:
        A two-element list of param-group dicts: the exempt group (decay 0)
        first, then the decayed group.
    """
    with_decay, without_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights
        exempt = len(param.shape) == 1 or name.endswith(".bias") or name in skip_list
        target = without_decay if exempt else with_decay
        target.append(param)

    exempt_group = {'params': without_decay, 'weight_decay': 0.}
    decayed_group = {'params': with_decay, 'weight_decay': weight_decay}
    return [exempt_group, decayed_group]


def create_optimizer(model, opt, weight_decay=1e-5, filter_bias_and_bn=True):
    """Build the iSogCLR optimizer for ``model``.

    When ``weight_decay`` is non-zero and ``filter_bias_and_bn`` is set, the
    parameters are split by ``add_weight_decay`` (honouring the model's
    ``no_weight_decay()`` list if it has one) and the optimizer-level decay
    is set to 0 so the decay comes from the param groups only. Otherwise all
    parameters are passed with the given ``weight_decay``.

    The learning rate is read from the module-level ``lr``.

    Args:
        model: module to optimize.
        opt: value forwarded to the optimizer as ``mode``.
        weight_decay: weight-decay coefficient.
        filter_bias_and_bn: whether to exempt parameters via ``add_weight_decay``.

    Returns:
        The constructed ``iSogCLR`` optimizer.
    """
    if not (weight_decay and filter_bias_and_bn):
        parameters = model.parameters()
    else:
        skip = model.no_weight_decay() if hasattr(model, 'no_weight_decay') else {}
        parameters = add_weight_decay(model, weight_decay, skip)
        # decay now lives in the param groups
        weight_decay = 0.

    return iSogCLR(parameters, mode=opt, lr=lr, weight_decay=weight_decay)

def create_scheduler(optimizer):
    """Build the cosine learning-rate scheduler for ``optimizer``.

    All schedule settings come from module-level configuration: ``epochs``,
    ``min_lr``, ``decay_rate``, ``warmup_lr`` and ``warmup_epochs``.

    Args:
        optimizer: the optimizer whose learning rate is scheduled.

    Returns:
        The configured ``CosineLRScheduler``.
    """
    schedule_kwargs = dict(
        t_initial=epochs,
        t_mul=1.0,
        lr_min=min_lr,
        decay_rate=decay_rate,
        warmup_lr_init=warmup_lr,
        warmup_t=warmup_epochs,
        cycle_limit=1,
        t_in_epochs=True,
        noise_range_t=None,
        noise_pct=0.67,
        noise_std=1.0,
        noise_seed=42,
    )
    return CosineLRScheduler(optimizer, **schedule_kwargs)

Reproducibility

The following statements limit the sources of random behavior, such as model initialization, data shuffling, etc.

# Fix the seeds of the three random number generators used in this file
# (PyTorch, NumPy and Python's `random`) so that runs are repeatable.
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
# NOTE(review): benchmark mode lets cuDNN auto-tune its kernels for speed; it
# may select different algorithms from run to run, so results are presumably
# not bit-for-bit reproducible. Confirm against the PyTorch reproducibility
# notes if exact determinism matters.
cudnn.benchmark = True

Objectives

Here, we mainly introduce the Robust Global Contrastive Loss (RGCL) for learning representations for bimodal data (e.g., image-text data). For the detailed formulation, please refer to the paper listed in the References section above.

Building the model

# The following class includes the image encoder, text encoder and several objectives
class Model(nn.Module):
    """Two-tower image/text model trained with ``GCLoss_v2`` (iSogCLR enabled).

    The image tower is a timm model whose classifier is reset to 0 classes,
    followed by a linear projection to ``embed_dim``. The text tower is a
    HuggingFace ``AutoModel``; the hidden state of its first token is
    projected to ``embed_dim``. Both embeddings are L2-normalized before
    being passed to the loss.

    Args:
        image_encoder: model name passed to ``timm.create_model``.
        text_encoder: model name passed to ``AutoModel.from_pretrained``.
        embed_dim: size of the shared embedding space.
        init_model: if True use pretrained weights for both encoders; if
            False the image encoder is created without pretrained weights
            and ``init_weights()`` is called on the text encoder.
        gamma: moving-average coefficient forwarded to the loss.
        temp: temperature forwarded to the loss as ``tau``.
        rho, eta: iSogCLR parameters forwarded to the loss.
        bsz, tau_init, beta_u, use_temp_net: accepted for interface
            compatibility; not used by this implementation.
    """

    def __init__(self, image_encoder = None, text_encoder = None,
                 embed_dim = 256, init_model = True, bsz = 128,
                 gamma = 0.9,         # the coefficient for moving average estimator
                 temp = 0.01,         # temperature for clip or sogclr
                 rho = 8.0, eta = 0.01, tau_init = 0.01, beta_u = 0.9,  # params for isogclr
                 use_temp_net = True):    # True if you want to use temperature network for isogclr
        super().__init__()

        self.temp = temp

        self.visual_encoder = timm.create_model(image_encoder, pretrained=init_model)
        self.visual_encoder.reset_classifier(0)

        self.text_encoder = AutoModel.from_pretrained(text_encoder, local_files_only=False)

        if not init_model:
            self.text_encoder.init_weights()

        self.vision_proj = nn.Linear(self.visual_encoder.num_features, embed_dim)

        # Take the text width from the encoder's config instead of assuming
        # 768, so other text encoders work too; 768 stays as the fallback
        # when the config does not expose `hidden_size`.
        text_width = getattr(self.text_encoder.config, 'hidden_size', 768)
        self.text_proj = nn.Linear(text_width, embed_dim)

        self.criterion = GCLoss_v2(tau=temp, gamma=gamma, tau_min=0.005, tau_max=0.07,
                                   rho=rho, eta=eta, enable_isogclr=True)

    def forward(self, image, text_ids, text_att_masks, idx, text_idx, epoch):
        """Compute the contrastive loss for one batch.

        Args:
            image: batch of images for the visual encoder.
            text_ids: token ids for the text encoder.
            text_att_masks: attention mask matching ``text_ids``.
            idx: per-sample indices forwarded to the loss.
            text_idx, epoch: accepted for interface compatibility; unused.

        Returns:
            ``(loss, info)`` exactly as returned by the loss module.
        """
        image_embeds = self.visual_encoder(image)
        image_embeds = self.vision_proj(image_embeds)
        image_feat = F.normalize(image_embeds, dim=-1)

        text_output = self.text_encoder(text_ids, attention_mask=text_att_masks, output_hidden_states=False)
        # use the hidden state of the first token as the sentence feature
        text_embeds = self.text_proj(text_output.last_hidden_state[:,0,:])
        text_feat = F.normalize(text_embeds, dim=-1)

        loss, info = self.criterion(image_feat, text_feat, idx)

        return loss, info

Training function

def epoch_train(model, data_loader, optimizer, tokenizer, epoch, max_epoch, warmup_steps, device, scheduler, grad_scaler, print_freq=50):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(image, input_ids, attention_mask,
            idx=..., text_idx=..., epoch=...)`` and returning ``(loss, info)``.
        data_loader: yields ``(image, text, idx, text_idx)`` batches.
        optimizer: optimizer stepped once per batch.
        tokenizer: callable that turns the list of captions into a batch with
            ``input_ids`` and ``attention_mask``.
        epoch: index of the current epoch (0-based).
        max_epoch: total number of epochs; unused here.
        warmup_steps: number of warm-up scheduler steps; one step is taken
            every 100 iterations during epoch 0.
        device: device the batch is moved to.
        scheduler: learning-rate scheduler, stepped during warm-up only.
        grad_scaler: AMP gradient scaler, or ``None`` to train without AMP.
        print_freq: log every ``print_freq`` iterations. The original code
            read an undefined global of this name; it is now a parameter.

    Returns:
        None.
    """
    # train
    model.train()

    step_size = 100
    warmup_iterations = warmup_steps * step_size

    for i,(image, text, idx, text_idx) in enumerate(data_loader):
        optimizer.zero_grad()

        image = image.to(device, non_blocking=True)
        idx = idx.to(device, non_blocking=True)
        text_idx = text_idx.to(device, non_blocking=True)
        text_input = tokenizer(text, padding='max_length', truncation=True, max_length=30, return_tensors="pt").to(device)

        if grad_scaler is None:
            loss, info = model(image, text_input.input_ids, text_input.attention_mask, idx=idx, text_idx=text_idx, epoch=epoch)
            # .mean() reduces the loss to a scalar (it may hold one value per replica)
            loss.mean().backward()
            optimizer.step()
        else:
            with torch.cuda.amp.autocast():
                loss, info = model(image, text_input.input_ids, text_input.attention_mask, idx=idx, text_idx=text_idx, epoch=epoch)
            grad_scaler.scale(loss.mean()).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()

        # warm-up: advance the scheduler every `step_size` iterations of the first epoch
        if epoch==0 and i%step_size==0 and i<=warmup_iterations:
            scheduler.step(i//step_size)

        if i%print_freq == 0:
            # named current_lr so it cannot be confused with the module-level `lr`
            current_lr = optimizer.param_groups[0]["lr"]
            print("Epoch:", epoch, "iteration:", i, "lr:", current_lr, "loss:", loss.mean().item())
            if info is not None:
                print("tau_img: %.4f, tau_txt: %.4f" % (info[0].mean(), info[1].mean()))

Evaluation function

def _topk_score_matrix(sims, k):
    """Keep the ``k`` largest entries of every row of ``sims``; set the rest to -100."""
    # never ask topk for more candidates than a row holds
    k = min(k, sims.size(1))
    scores = torch.full(sims.shape, -100.0, device=sims.device)
    if k > 0:
        topk_sim, topk_idx = sims.topk(k=k, dim=1)
        scores.scatter_(1, topk_idx, topk_sim.to(scores.dtype))
    return scores


@torch.no_grad()
def evaluation(model, data_loader, tokenizer, device):
    """Compute image-to-text and text-to-image score matrices.

    Text features are computed from ``data_loader.dataset.text`` in chunks of
    256 captions; image features come from iterating ``data_loader``. For
    every query only the ``k_test`` (module-level setting) most similar
    candidates keep their similarity; all other entries are -100. If fewer
    than ``k_test`` candidates exist, all of them are kept.

    Args:
        model: un-wrapped model exposing ``text_encoder``, ``text_proj``,
            ``visual_encoder`` and ``vision_proj``.
        data_loader: loader over an evaluation dataset yielding
            ``(image, img_id)`` and exposing ``dataset.text``.
        tokenizer: tokenizer used for the captions.
        device: device used for the computation.

    Returns:
        ``(score_matrix_i2t, score_matrix_t2i)`` as NumPy arrays of shape
        ``(num_images, num_texts)`` and ``(num_texts, num_images)``.
    """
    # test
    model.eval()

    print('Computing features for evaluation...')
    texts = data_loader.dataset.text
    num_text = len(texts)
    text_bs = 256
    text_embeds = []
    for i in range(0, num_text, text_bs):
        text = texts[i:i + text_bs]
        text_input = tokenizer(text, padding='max_length', truncation=True, max_length=30, return_tensors="pt").to(device)
        text_output = model.text_encoder(text_input.input_ids, attention_mask=text_input.attention_mask, output_hidden_states=False)
        text_embed = F.normalize(model.text_proj(text_output.last_hidden_state[:,0,:]), dim=-1)
        text_embeds.append(text_embed)
    text_embeds = torch.cat(text_embeds,dim=0)

    image_embeds = []
    for image, img_id in data_loader:
        image = image.to(device)
        image_feat = model.visual_encoder(image)
        image_embed = model.vision_proj(image_feat)
        image_embed = F.normalize(image_embed, dim=-1)
        image_embeds.append(image_embed)
    image_embeds = torch.cat(image_embeds,dim=0)

    # rows: images, columns: texts
    sims_matrix = image_embeds @ text_embeds.t()
    score_matrix_i2t = _topk_score_matrix(sims_matrix, k_test)
    score_matrix_t2i = _topk_score_matrix(sims_matrix.t(), k_test)

    return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy()



@torch.no_grad()
def itm_eval(scores_i2t, scores_t2i, txt2img, img2txt):
    """Compute recall@{1,5,10} for both retrieval directions.

    Args:
        scores_i2t: array of shape (num_images, num_texts) with image-to-text
            scores (higher is better).
        scores_t2i: array of shape (num_texts, num_images) with text-to-image
            scores.
        txt2img: mapping from text index to its ground-truth image index.
        img2txt: mapping from image index to its ground-truth text indices.

    Returns:
        Dict with ``txt_r1/5/10``, ``txt_r_mean``, ``img_r1/5/10``,
        ``img_r_mean`` and ``r_mean``, all in percent.
    """
    def recall_at(ranks, k):
        # share of queries whose best ground-truth rank is below k
        return 100.0 * len(np.where(ranks < k)[0]) / len(ranks)

    # Images->Text: the rank of an image is the best rank among its captions
    i2t_ranks = np.zeros(scores_i2t.shape[0])
    for img_idx, row in enumerate(scores_i2t):
        order = np.argsort(row)[::-1]
        i2t_ranks[img_idx] = min(
            (np.where(order == txt_id)[0][0] for txt_id in img2txt[img_idx]),
            default=1e20,
        )

    tr1, tr5, tr10 = (recall_at(i2t_ranks, k) for k in (1, 5, 10))

    # Text->Images: every caption has exactly one ground-truth image
    t2i_ranks = np.zeros(scores_t2i.shape[0])
    for txt_idx, row in enumerate(scores_t2i):
        order = np.argsort(row)[::-1]
        t2i_ranks[txt_idx] = np.where(order == txt2img[txt_idx])[0][0]

    ir1, ir5, ir10 = (recall_at(t2i_ranks, k) for k in (1, 5, 10))

    tr_mean = (tr1 + tr5 + tr10) / 3
    ir_mean = (ir1 + ir5 + ir10) / 3
    r_mean = (tr_mean + ir_mean) / 2

    return {
        'txt_r1': tr1,
        'txt_r5': tr5,
        'txt_r10': tr10,
        'txt_r_mean': tr_mean,
        'img_r1': ir1,
        'img_r5': ir5,
        'img_r10': ir10,
        'img_r_mean': ir_mean,
        'r_mean': r_mean,
    }

Create datasets and dataloaders

# Build the image transforms, the datasets and their dataloaders.
# Both pipelines end with the same ToTensor + Normalize steps.
normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

# training: random crop / flip / RandAugment
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(image_res, scale=(0.5, 1.0), interpolation=Image.BICUBIC),
    transforms.RandomHorizontalFlip(),
    transforms.RandAugment(),
    transforms.ToTensor(),
    normalize,
])

# evaluation: plain resize to a square image
test_transform = transforms.Compose([
    transforms.Resize((image_res, image_res), interpolation=Image.BICUBIC),
    transforms.ToTensor(),
    normalize,
])

train_dataset = train_set([train_file], train_transform, data_path)
val_coco_dataset = eval_set(val_coco_file, test_transform, coco_image_root)
test_coco_dataset = eval_set(test_coco_file, test_transform, coco_image_root)

print("len of train_dataset:", len(train_dataset))
print("len of coco val/test:", len(val_coco_dataset), len(test_coco_dataset))

# the training batch size is per GPU, so it is scaled by the number of GPUs
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size_train * n_gpus,
    shuffle=True,
    drop_last=True,
    num_workers=16,
    pin_memory=True,
    prefetch_factor=4,
)

# val and test loaders share every setting except the dataset
eval_loader_kwargs = dict(batch_size=batch_size_test, shuffle=False, drop_last=False,
                          num_workers=16, pin_memory=True, prefetch_factor=12)
val_loader = DataLoader(val_coco_dataset, **eval_loader_kwargs)
test_loader = DataLoader(test_coco_dataset, **eval_loader_kwargs)

len of train_dataset: 300000
len of coco val/test: 5000 5000

Launch training and evaluation for iSogCLR

# create the tokenizer and the model; with several visible GPUs the model is
# wrapped in nn.DataParallel below
tokenizer = AutoTokenizer.from_pretrained(text_encoder, local_files_only=False)
model = Model(image_encoder=image_encoder, text_encoder=text_encoder, embed_dim=embed_dim,
              init_model=True, bsz=batch_size_train,
              gamma=gamma, temp=temp, rho=rho, eta=eta, tau_init=tau_init, beta_u=beta_u)

model = model.cuda()

if n_gpus > 1:
    print("Using", n_gpus, "GPUs")
    model = nn.DataParallel(model)

# set up the optimizer and the learning-rate scheduler
optimizer = create_optimizer(model, opt, weight_decay)
lr_scheduler = create_scheduler(optimizer)

# gradient scaler for AMP training; None disables AMP in epoch_train
if use_amp:
    grad_scaler = torch.cuda.amp.GradScaler()
else:
    grad_scaler = None

# training loop
for epoch in range(0, epochs):
    train_stats = epoch_train(model, train_loader, optimizer, tokenizer, epoch, epochs,
                              warmup_epochs, torch.device('cuda'), lr_scheduler, grad_scaler)

    # evaluate the model on ms-coco data
    # Evaluation accesses the encoders directly, so unwrap nn.DataParallel
    # when it is used. This replaces a bare `except:` that retried evaluation
    # after *any* failure and thereby hid real errors.
    eval_model = getattr(model, 'module', model)
    score_val_i2t_coco, score_val_t2i_coco = evaluation(eval_model, val_loader, tokenizer,  torch.device('cuda'))
    score_test_i2t_coco, score_test_t2i_coco = evaluation(eval_model, test_loader, tokenizer,  torch.device('cuda'))
    print("Epoch:", epoch)
    val_result_coco = itm_eval(score_val_i2t_coco, score_val_t2i_coco, val_loader.dataset.txt2img, val_loader.dataset.img2txt)
    print("coco val:", val_result_coco)
    test_result_coco = itm_eval(score_test_i2t_coco, score_test_t2i_coco, test_loader.dataset.txt2img, test_loader.dataset.img2txt)
    print("coco test:", test_result_coco)

    # the warm-up steps were consumed inside epoch 0, so continue the schedule
    # after them
    lr_scheduler.step(epoch+warmup_epochs+1)

Epoch: 0 iteration: 0 lr: 1e-05 loss: 24.701007843017578
tau_img: 0.0100, tau_txt: 0.0100
Epoch: 0 iteration: 50 lr: 1e-05 loss: 10.574981689453125
tau_img: 0.0100, tau_txt: 0.0100
Epoch: 0 iteration: 100 lr: 2.45e-05 loss: 4.697925567626953
tau_img: 0.0100, tau_txt: 0.0100
Epoch: 0 iteration: 150 lr: 2.45e-05 loss: 1.9576847553253174
tau_img: 0.0100, tau_txt: 0.0100
Epoch: 0 iteration: 200 lr: 3.899999999999999e-05 loss: 1.0460829734802246
tau_img: 0.0100, tau_txt: 0.0100
Epoch: 0 iteration: 250 lr: 3.899999999999999e-05 loss: 0.5043810606002808
tau_img: 0.0100, tau_txt: 0.0100
Computing features for evaluation...
Computing features for evaluation...
Epoch: 0
coco val: {'txt_r1': 4.1, 'txt_r5': 13.8, 'txt_r10': 21.34, 'txt_r_mean': 13.079999999999998, 'img_r1': 2.0591763294682126, 'img_r5': 7.860855657736905, 'img_r10': 13.13874450219912, 'img_r_mean': 7.686258829801413, 'r_mean': 10.383129414900706}
coco test: {'txt_r1': 4.2, 'txt_r5': 12.7, 'txt_r10': 20.2, 'txt_r_mean': 12.366666666666665, 'img_r1': 1.9832067173130747, 'img_r5': 7.493002798880448, 'img_r10': 12.950819672131148, 'img_r_mean': 7.4756763961082235, 'r_mean': 9.921171531387444}
Epoch: 1 iteration: 0 lr: 0.0002992056748283996 loss: 1.3195196390151978
tau_img: 0.0094, tau_txt: 0.0095
Epoch: 1 iteration: 50 lr: 0.0002992056748283996 loss: 0.075884610414505
tau_img: 0.0094, tau_txt: 0.0095
Epoch: 1 iteration: 100 lr: 0.0002992056748283996 loss: 0.3162369430065155
tau_img: 0.0094, tau_txt: 0.0095
Epoch: 1 iteration: 150 lr: 0.0002992056748283996 loss: 0.1882624328136444
tau_img: 0.0094, tau_txt: 0.0095
Epoch: 1 iteration: 200 lr: 0.0002992056748283996 loss: -0.10296255350112915
tau_img: 0.0094, tau_txt: 0.0095
Epoch: 1 iteration: 250 lr: 0.0002992056748283996 loss: 0.15444990992546082
tau_img: 0.0094, tau_txt: 0.0095
Computing features for evaluation...
Computing features for evaluation...
Epoch: 1
coco val: {'txt_r1': 12.22, 'txt_r5': 28.74, 'txt_r10': 40.32, 'txt_r_mean': 27.093333333333334, 'img_r1': 5.881647341063575, 'img_r5': 18.10075969612155, 'img_r10': 27.608956417433028, 'img_r_mean': 17.197121151539385, 'r_mean': 22.14522724243636}
coco test: {'txt_r1': 11.34, 'txt_r5': 29.4, 'txt_r10': 40.32, 'txt_r_mean': 27.02, 'img_r1': 5.593762495001999, 'img_r5': 18.376649340263896, 'img_r10': 27.984806077568972, 'img_r_mean': 17.318405970944955, 'r_mean': 22.169202985472477}
Epoch: 2 iteration: 0 lr: 0.0002968314021064018 loss: -0.0604383647441864
tau_img: 0.0088, tau_txt: 0.0088
Epoch: 2 iteration: 50 lr: 0.0002968314021064018 loss: 0.23243539035320282
tau_img: 0.0088, tau_txt: 0.0088
Epoch: 2 iteration: 100 lr: 0.0002968314021064018 loss: 0.04821205139160156
tau_img: 0.0088, tau_txt: 0.0088
Epoch: 2 iteration: 150 lr: 0.0002968314021064018 loss: 0.21965868771076202
tau_img: 0.0088, tau_txt: 0.0088
Epoch: 2 iteration: 200 lr: 0.0002968314021064018 loss: 0.05134771019220352
tau_img: 0.0088, tau_txt: 0.0088
Epoch: 2 iteration: 250 lr: 0.0002968314021064018 loss: 0.1536252200603485
tau_img: 0.0088, tau_txt: 0.0088
Computing features for evaluation...
Computing features for evaluation...
Epoch: 2
coco val: {'txt_r1': 14.64, 'txt_r5': 35.0, 'txt_r10': 46.5, 'txt_r_mean': 32.04666666666667, 'img_r1': 7.97281087564974, 'img_r5': 22.898840463814473, 'img_r10': 33.77049180327869, 'img_r_mean': 21.547381047580966, 'r_mean': 26.79702385712382}
coco test: {'txt_r1': 15.14, 'txt_r5': 34.42, 'txt_r10': 46.54, 'txt_r_mean': 32.03333333333333, 'img_r1': 8.388644542183126, 'img_r5': 23.594562175129948, 'img_r10': 34.406237504998, 'img_r_mean': 22.12981474077036, 'r_mean': 27.081574037051844}
Epoch: 3 iteration: 0 lr: 0.00029290319486279724 loss: -0.29481595754623413
tau_img: 0.0083, tau_txt: 0.0081
Epoch: 3 iteration: 50 lr: 0.00029290319486279724 loss: 0.06638230383396149
tau_img: 0.0083, tau_txt: 0.0081
Epoch: 3 iteration: 100 lr: 0.00029290319486279724 loss: 0.03567551076412201
tau_img: 0.0083, tau_txt: 0.0082
Epoch: 3 iteration: 150 lr: 0.00029290319486279724 loss: 0.05767179653048515
tau_img: 0.0083, tau_txt: 0.0081
Epoch: 3 iteration: 200 lr: 0.00029290319486279724 loss: 0.056682661175727844
tau_img: 0.0083, tau_txt: 0.0082
Epoch: 3 iteration: 250 lr: 0.00029290319486279724 loss: 0.28257113695144653
tau_img: 0.0083, tau_txt: 0.0082
Computing features for evaluation...
Computing features for evaluation...
Epoch: 3
coco val: {'txt_r1': 15.9, 'txt_r5': 37.2, 'txt_r10': 49.18, 'txt_r_mean': 34.093333333333334, 'img_r1': 9.70811675329868, 'img_r5': 26.3734506197521, 'img_r10': 37.31707317073171, 'img_r_mean': 24.466213514594163, 'r_mean': 29.279773423963746}
coco test: {'txt_r1': 15.52, 'txt_r5': 37.28, 'txt_r10': 48.94, 'txt_r_mean': 33.913333333333334, 'img_r1': 9.660135945621752, 'img_r5': 26.66533386645342, 'img_r10': 37.49300279888045, 'img_r_mean': 24.606157536985204, 'r_mean': 29.259745435159267}
Epoch: 4 iteration: 0 lr: 0.00028746409135817707 loss: -0.2583860158920288
tau_img: 0.0079, tau_txt: 0.0077
Epoch: 4 iteration: 50 lr: 0.00028746409135817707 loss: 0.04029808193445206
tau_img: 0.0079, tau_txt: 0.0076
Epoch: 4 iteration: 100 lr: 0.00028746409135817707 loss: 0.11739009618759155
tau_img: 0.0079, tau_txt: 0.0076
Epoch: 4 iteration: 150 lr: 0.00028746409135817707 loss: 0.32731348276138306
tau_img: 0.0079, tau_txt: 0.0076
Epoch: 4 iteration: 200 lr: 0.00028746409135817707 loss: -0.00629810243844986
tau_img: 0.0079, tau_txt: 0.0076
Epoch: 4 iteration: 250 lr: 0.00028746409135817707 loss: 0.15173837542533875
tau_img: 0.0079, tau_txt: 0.0076
Computing features for evaluation...
Computing features for evaluation...
Epoch: 4
coco val: {'txt_r1': 17.16, 'txt_r5': 38.44, 'txt_r10': 50.34, 'txt_r_mean': 35.31333333333333, 'img_r1': 10.903638544582167, 'img_r5': 27.86485405837665, 'img_r10': 39.40423830467813, 'img_r_mean': 26.057576969212317, 'r_mean': 30.685455151272826}
coco test: {'txt_r1': 17.0, 'txt_r5': 37.84, 'txt_r10': 50.16, 'txt_r_mean': 35.0, 'img_r1': 10.415833666533386, 'img_r5': 28.58856457417033, 'img_r10': 40.26789284286286, 'img_r_mean': 26.424097027855524, 'r_mean': 30.712048513927762}
Epoch: 5 iteration: 0 lr: 0.0002805736835487436 loss: -0.4848897457122803
tau_img: 0.0075, tau_txt: 0.0072
Epoch: 5 iteration: 50 lr: 0.0002805736835487436 loss: 0.06531377136707306
tau_img: 0.0075, tau_txt: 0.0072
Epoch: 5 iteration: 100 lr: 0.0002805736835487436 loss: 0.09321524202823639
tau_img: 0.0075, tau_txt: 0.0072
Epoch: 5 iteration: 150 lr: 0.0002805736835487436 loss: 0.218039870262146
tau_img: 0.0075, tau_txt: 0.0073
Epoch: 5 iteration: 200 lr: 0.0002805736835487436 loss: 0.1558637171983719
tau_img: 0.0075, tau_txt: 0.0072
Epoch: 5 iteration: 250 lr: 0.0002805736835487436 loss: -0.09588228911161423
tau_img: 0.0075, tau_txt: 0.0072
Computing features for evaluation...
Computing features for evaluation...
Epoch: 5
coco val: {'txt_r1': 18.54, 'txt_r5': 40.0, 'txt_r10': 51.6, 'txt_r_mean': 36.71333333333333, 'img_r1': 11.015593762495001, 'img_r5': 28.984406237505, 'img_r10': 40.42782886845262, 'img_r_mean': 26.809276289484206, 'r_mean': 31.76130481140877}
coco test: {'txt_r1': 16.56, 'txt_r5': 38.8, 'txt_r10': 51.22, 'txt_r_mean': 35.526666666666664, 'img_r1': 11.107556977209116, 'img_r5': 29.072371051579367, 'img_r10': 40.77169132347061, 'img_r_mean': 26.983873117419694, 'r_mean': 31.25526989204318}
Epoch: 6 iteration: 0 lr: 0.0002723074641843674 loss: -0.5769622325897217
tau_img: 0.0072, tau_txt: 0.0069
Epoch: 6 iteration: 50 lr: 0.0002723074641843674 loss: 0.37227633595466614
tau_img: 0.0072, tau_txt: 0.0069
Epoch: 6 iteration: 100 lr: 0.0002723074641843674 loss: 0.06294765323400497
tau_img: 0.0072, tau_txt: 0.0069
Epoch: 6 iteration: 150 lr: 0.0002723074641843674 loss: -0.028086403384804726
tau_img: 0.0072, tau_txt: 0.0069
Epoch: 6 iteration: 200 lr: 0.0002723074641843674 loss: 0.08182275295257568
tau_img: 0.0072, tau_txt: 0.0069
Epoch: 6 iteration: 250 lr: 0.0002723074641843674 loss: 0.16375750303268433
tau_img: 0.0072, tau_txt: 0.0069
Computing features for evaluation...
Computing features for evaluation...
Epoch: 6
coco val: {'txt_r1': 18.02, 'txt_r5': 40.82, 'txt_r10': 53.12, 'txt_r_mean': 37.32, 'img_r1': 11.431427429028389, 'img_r5': 29.748100759696122, 'img_r10': 41.47940823670532, 'img_r_mean': 27.55297880847661, 'r_mean': 32.4364894042383}
coco test: {'txt_r1': 17.68, 'txt_r5': 40.18, 'txt_r10': 52.56, 'txt_r_mean': 36.806666666666665, 'img_r1': 11.75529788084766, 'img_r5': 30.151939224310276, 'img_r10': 41.89924030387845, 'img_r_mean': 27.935492469678792, 'r_mean': 32.37107956817273}
Epoch: 7 iteration: 0 lr: 0.00026275599969422214 loss: -0.4518427550792694
tau_img: 0.0070, tau_txt: 0.0067
Epoch: 7 iteration: 50 lr: 0.00026275599969422214 loss: 0.2819710075855255
tau_img: 0.0070, tau_txt: 0.0067
Epoch: 7 iteration: 100 lr: 0.00026275599969422214 loss: 0.05290326103568077
tau_img: 0.0070, tau_txt: 0.0067
Epoch: 7 iteration: 150 lr: 0.00026275599969422214 loss: -0.008920110762119293
tau_img: 0.0070, tau_txt: 0.0067
Epoch: 7 iteration: 200 lr: 0.00026275599969422214 loss: 0.2930781841278076
tau_img: 0.0070, tau_txt: 0.0067
Epoch: 7 iteration: 250 lr: 0.00026275599969422214 loss: 0.14736725389957428
tau_img: 0.0070, tau_txt: 0.0067
Computing features for evaluation...
Computing features for evaluation...
Epoch: 7
coco val: {'txt_r1': 17.88, 'txt_r5': 40.54, 'txt_r10': 52.78, 'txt_r_mean': 37.06666666666667, 'img_r1': 11.571371451419433, 'img_r5': 30.023990403838464, 'img_r10': 41.543382646941225, 'img_r_mean': 27.71291483406638, 'r_mean': 32.38979075036652}
coco test: {'txt_r1': 18.14, 'txt_r5': 39.58, 'txt_r10': 51.58, 'txt_r_mean': 36.43333333333333, 'img_r1': 12.167133146741303, 'img_r5': 30.851659336265495, 'img_r10': 42.4390243902439, 'img_r_mean': 28.485938957750232, 'r_mean': 32.45963614554178}
Epoch: 8 iteration: 0 lr: 0.0002520239379220344 loss: -0.36706972122192383
tau_img: 0.0068, tau_txt: 0.0065
Epoch: 8 iteration: 50 lr: 0.0002520239379220344 loss: -0.229108527302742
tau_img: 0.0068, tau_txt: 0.0065
Epoch: 8 iteration: 100 lr: 0.0002520239379220344 loss: 0.31043940782546997
tau_img: 0.0068, tau_txt: 0.0065
Epoch: 8 iteration: 150 lr: 0.0002520239379220344 loss: 0.00404047966003418
tau_img: 0.0069, tau_txt: 0.0066
Epoch: 8 iteration: 200 lr: 0.0002520239379220344 loss: -0.24809685349464417
tau_img: 0.0069, tau_txt: 0.0066
Epoch: 8 iteration: 250 lr: 0.0002520239379220344 loss: -0.2770186960697174
tau_img: 0.0068, tau_txt: 0.0065
Computing features for evaluation...
Computing features for evaluation...
Epoch: 8
coco val: {'txt_r1': 16.92, 'txt_r5': 38.66, 'txt_r10': 51.2, 'txt_r_mean': 35.593333333333334, 'img_r1': 11.38344662135146, 'img_r5': 29.760095961615352, 'img_r10': 41.63934426229508, 'img_r_mean': 27.5942956150873, 'r_mean': 31.59381447421032}
coco test: {'txt_r1': 17.36, 'txt_r5': 38.22, 'txt_r10': 50.44, 'txt_r_mean': 35.339999999999996, 'img_r1': 11.82327069172331, 'img_r5': 30.3718512594962, 'img_r10': 41.74330267892843, 'img_r_mean': 27.979474876715983, 'r_mean': 31.65973743835799}
Epoch: 9 iteration: 0 lr: 0.00024022886158240857 loss: -0.7354167699813843
tau_img: 0.0067, tau_txt: 0.0064
Epoch: 9 iteration: 50 lr: 0.00024022886158240857 loss: -0.14618906378746033
tau_img: 0.0067, tau_txt: 0.0064
Epoch: 9 iteration: 100 lr: 0.00024022886158240857 loss: 0.12334905564785004
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 9 iteration: 150 lr: 0.00024022886158240857 loss: -0.45143190026283264
tau_img: 0.0067, tau_txt: 0.0065
Epoch: 9 iteration: 200 lr: 0.00024022886158240857 loss: 0.06901969015598297
tau_img: 0.0067, tau_txt: 0.0065
Epoch: 9 iteration: 250 lr: 0.00024022886158240857 loss: 0.02915862947702408
tau_img: 0.0067, tau_txt: 0.0064
Computing features for evaluation...
Computing features for evaluation...
Epoch: 9
coco val: {'txt_r1': 17.24, 'txt_r5': 39.68, 'txt_r10': 52.52, 'txt_r_mean': 36.48, 'img_r1': 11.943222710915634, 'img_r5': 30.279888044782087, 'img_r10': 42.059176329468215, 'img_r_mean': 28.094095695055312, 'r_mean': 32.28704784752765}
coco test: {'txt_r1': 17.64, 'txt_r5': 39.44, 'txt_r10': 50.9, 'txt_r_mean': 35.99333333333333, 'img_r1': 11.975209916033586, 'img_r5': 30.463814474210317, 'img_r10': 41.97920831667333, 'img_r_mean': 28.13941090230574, 'r_mean': 32.06637211781954}
Epoch: 10 iteration: 0 lr: 0.00022749999999999997 loss: -0.9465005993843079
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 10 iteration: 50 lr: 0.00022749999999999997 loss: -0.1919674426317215
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 10 iteration: 100 lr: 0.00022749999999999997 loss: 0.0656488761305809
tau_img: 0.0066, tau_txt: 0.0063
Epoch: 10 iteration: 150 lr: 0.00022749999999999997 loss: 0.15473569929599762
tau_img: 0.0066, tau_txt: 0.0063
Epoch: 10 iteration: 200 lr: 0.00022749999999999997 loss: 0.048671215772628784
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 10 iteration: 250 lr: 0.00022749999999999997 loss: 0.05919775739312172
tau_img: 0.0066, tau_txt: 0.0063
Computing features for evaluation...
Computing features for evaluation...
Epoch: 10
coco val: {'txt_r1': 17.54, 'txt_r5': 39.96, 'txt_r10': 52.46, 'txt_r_mean': 36.653333333333336, 'img_r1': 12.039184326269492, 'img_r5': 30.89564174330268, 'img_r10': 42.55897640943623, 'img_r_mean': 28.497934159669466, 'r_mean': 32.5756337465014}
coco test: {'txt_r1': 17.24, 'txt_r5': 38.94, 'txt_r10': 51.24, 'txt_r_mean': 35.806666666666665, 'img_r1': 12.191123550579768, 'img_r5': 30.947620951619353, 'img_r10': 42.958816473410636, 'img_r_mean': 28.69918699186992, 'r_mean': 32.25292682926829}
Epoch: 11 iteration: 0 lr: 0.00021397681324599103 loss: -0.8527200222015381
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 11 iteration: 50 lr: 0.00021397681324599103 loss: -0.310724675655365
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 11 iteration: 100 lr: 0.00021397681324599103 loss: -0.18071337044239044
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 11 iteration: 150 lr: 0.00021397681324599103 loss: -0.15896828472614288
tau_img: 0.0067, tau_txt: 0.0064
Epoch: 11 iteration: 200 lr: 0.00021397681324599103 loss: 0.125459223985672
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 11 iteration: 250 lr: 0.00021397681324599103 loss: 0.005948394536972046
tau_img: 0.0066, tau_txt: 0.0064
Computing features for evaluation...
Computing features for evaluation...
Epoch: 11
coco val: {'txt_r1': 18.22, 'txt_r5': 40.72, 'txt_r10': 53.08, 'txt_r_mean': 37.339999999999996, 'img_r1': 12.367053178728508, 'img_r5': 31.231507397041185, 'img_r10': 42.890843662534984, 'img_r_mean': 28.829801412768223, 'r_mean': 33.08490070638411}
coco test: {'txt_r1': 19.12, 'txt_r5': 40.38, 'txt_r10': 52.52, 'txt_r_mean': 37.34, 'img_r1': 12.29908036785286, 'img_r5': 31.215513794482206, 'img_r10': 43.082766893242706, 'img_r_mean': 28.865787018525925, 'r_mean': 33.10289350926296}
Epoch: 12 iteration: 0 lr: 0.00019980746418436736 loss: -0.8759943246841431
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 12 iteration: 50 lr: 0.00019980746418436736 loss: -0.6733912229537964
tau_img: 0.0067, tau_txt: 0.0064
Epoch: 12 iteration: 100 lr: 0.00019980746418436736 loss: 0.007951691746711731
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 12 iteration: 150 lr: 0.00019980746418436736 loss: -0.27293896675109863
tau_img: 0.0066, tau_txt: 0.0064
Epoch: 12 iteration: 200 lr: 0.00019980746418436736 loss: -0.604184627532959
tau_img: 0.0067, tau_txt: 0.0065
Epoch: 12 iteration: 250 lr: 0.00019980746418436736 loss: -0.08432623744010925
tau_img: 0.0066, tau_txt: 0.0064
Computing features for evaluation...
Computing features for evaluation...
Epoch: 12
coco val: {'txt_r1': 18.26, 'txt_r5': 40.38, 'txt_r10': 53.12, 'txt_r_mean': 37.25333333333333, 'img_r1': 12.522990803678528, 'img_r5': 31.70731707317073, 'img_r10': 43.122750899640145, 'img_r_mean': 29.117686258829803, 'r_mean': 33.18550979608157}
coco test: {'txt_r1': 17.34, 'txt_r5': 39.08, 'txt_r10': 52.32, 'txt_r_mean': 36.24666666666667, 'img_r1': 12.798880447820872, 'img_r5': 31.759296281487405, 'img_r10': 43.05477808876449, 'img_r_mean': 29.204318272690927, 'r_mean': 32.7254924696788}
Epoch: 13 iteration: 0 lr: 0.00018514719516857505 loss: -1.3101189136505127
tau_img: 0.0069, tau_txt: 0.0066
Epoch: 13 iteration: 50 lr: 0.00018514719516857505 loss: -0.5373433828353882
tau_img: 0.0068, tau_txt: 0.0065
Epoch: 13 iteration: 100 lr: 0.00018514719516857505 loss: -0.2286771833896637
tau_img: 0.0068, tau_txt: 0.0065
Epoch: 13 iteration: 150 lr: 0.00018514719516857505 loss: -0.17678964138031006
tau_img: 0.0067, tau_txt: 0.0064
Epoch: 13 iteration: 200 lr: 0.00018514719516857505 loss: -0.24495404958724976
tau_img: 0.0068, tau_txt: 0.0066
Epoch: 13 iteration: 250 lr: 0.00018514719516857505 loss: -0.5934573411941528
tau_img: 0.0068, tau_txt: 0.0066
Computing features for evaluation...
Computing features for evaluation...
Epoch: 13
coco val: {'txt_r1': 19.1, 'txt_r5': 40.84, 'txt_r10': 53.04, 'txt_r_mean': 37.660000000000004, 'img_r1': 12.538984406237505, 'img_r5': 31.36345461815274, 'img_r10': 42.94282287085166, 'img_r_mean': 28.9484206317473, 'r_mean': 33.304210315873654}
coco test: {'txt_r1': 18.26, 'txt_r5': 40.74, 'txt_r10': 53.12, 'txt_r_mean': 37.373333333333335, 'img_r1': 12.810875649740105, 'img_r5': 31.955217912834865, 'img_r10': 43.398640543782484, 'img_r_mean': 29.388244702119152, 'r_mean': 33.380789017726244}
Epoch: 14 iteration: 0 lr: 0.00017015662717380974 loss: -1.136932611465454
tau_img: 0.0069, tau_txt: 0.0067
Epoch: 14 iteration: 50 lr: 0.00017015662717380974 loss: -1.2352209091186523
tau_img: 0.0071, tau_txt: 0.0068
Epoch: 14 iteration: 100 lr: 0.00017015662717380974 loss: -0.3656700551509857
tau_img: 0.0069, tau_txt: 0.0067
Epoch: 14 iteration: 150 lr: 0.00017015662717380974 loss: -0.7482412457466125
tau_img: 0.0068, tau_txt: 0.0066
Epoch: 14 iteration: 200 lr: 0.00017015662717380974 loss: -0.6269024014472961
tau_img: 0.0070, tau_txt: 0.0068
Epoch: 14 iteration: 250 lr: 0.00017015662717380974 loss: -0.8550422191619873
tau_img: 0.0070, tau_txt: 0.0067
Computing features for evaluation...
Computing features for evaluation...
Epoch: 14
coco val: {'txt_r1': 18.52, 'txt_r5': 39.8, 'txt_r10': 52.78, 'txt_r_mean': 37.03333333333333, 'img_r1': 12.758896441423431, 'img_r5': 32.279088364654136, 'img_r10': 44.17433026789284, 'img_r_mean': 29.73743835799014, 'r_mean': 33.38538584566174}
coco test: {'txt_r1': 17.52, 'txt_r5': 39.96, 'txt_r10': 51.94, 'txt_r_mean': 36.473333333333336, 'img_r1': 12.902838864454218, 'img_r5': 31.887245101959216, 'img_r10': 43.750499800079965, 'img_r_mean': 29.513527922164467, 'r_mean': 32.9934306277489}
Epoch: 15 iteration: 0 lr: 0.000155 loss: -1.8559613227844238
tau_img: 0.0072, tau_txt: 0.0069
Epoch: 15 iteration: 50 lr: 0.000155 loss: -1.2427170276641846
tau_img: 0.0073, tau_txt: 0.0070
Epoch: 15 iteration: 100 lr: 0.000155 loss: -1.1395246982574463
tau_img: 0.0072, tau_txt: 0.0070
Epoch: 15 iteration: 150 lr: 0.000155 loss: -1.4752817153930664
tau_img: 0.0072, tau_txt: 0.0069
Epoch: 15 iteration: 200 lr: 0.000155 loss: -1.8828952312469482
tau_img: 0.0072, tau_txt: 0.0070
Epoch: 15 iteration: 250 lr: 0.000155 loss: -1.181127905845642
tau_img: 0.0072, tau_txt: 0.0070
Computing features for evaluation...
Computing features for evaluation...
Epoch: 15
coco val: {'txt_r1': 19.9, 'txt_r5': 43.36, 'txt_r10': 55.22, 'txt_r_mean': 39.49333333333333, 'img_r1': 13.478608556577369, 'img_r5': 32.810875649740105, 'img_r10': 44.40223910435826, 'img_r_mean': 30.230574436891914, 'r_mean': 34.86195388511263}
coco test: {'txt_r1': 19.58, 'txt_r5': 43.1, 'txt_r10': 54.84, 'txt_r_mean': 39.17333333333334, 'img_r1': 13.642542982806876, 'img_r5': 33.218712514994, 'img_r10': 44.718112754898044, 'img_r_mean': 30.526456084232976, 'r_mean': 34.849894708783154}
Epoch: 16 iteration: 0 lr: 0.00013984337282619026 loss: -2.054107189178467
tau_img: 0.0073, tau_txt: 0.0072
Epoch: 16 iteration: 50 lr: 0.00013984337282619026 loss: -1.3603992462158203
tau_img: 0.0073, tau_txt: 0.0071
Epoch: 16 iteration: 100 lr: 0.00013984337282619026 loss: -1.8992851972579956
tau_img: 0.0074, tau_txt: 0.0071
Epoch: 16 iteration: 150 lr: 0.00013984337282619026 loss: -1.8692710399627686
tau_img: 0.0074, tau_txt: 0.0072
Epoch: 16 iteration: 200 lr: 0.00013984337282619026 loss: -1.7104038000106812
tau_img: 0.0075, tau_txt: 0.0072
Epoch: 16 iteration: 250 lr: 0.00013984337282619026 loss: -1.380126953125
tau_img: 0.0073, tau_txt: 0.0071
Computing features for evaluation...
Computing features for evaluation...
Epoch: 16
coco val: {'txt_r1': 20.58, 'txt_r5': 43.24, 'txt_r10': 55.3, 'txt_r_mean': 39.70666666666667, 'img_r1': 13.15873650539784, 'img_r5': 32.99480207916833, 'img_r10': 44.586165533786485, 'img_r_mean': 30.24656803945088, 'r_mean': 34.97661735305878}
coco test: {'txt_r1': 19.36, 'txt_r5': 42.4, 'txt_r10': 54.48, 'txt_r_mean': 38.74666666666666, 'img_r1': 13.666533386645343, 'img_r5': 33.2546981207517, 'img_r10': 44.65413834466214, 'img_r_mean': 30.525123284019724, 'r_mean': 34.63589497534319}
Epoch: 17 iteration: 0 lr: 0.00012485280483142487 loss: -2.5637669563293457
tau_img: 0.0075, tau_txt: 0.0073
Epoch: 17 iteration: 50 lr: 0.00012485280483142487 loss: -2.191415309906006
tau_img: 0.0078, tau_txt: 0.0075
Epoch: 17 iteration: 100 lr: 0.00012485280483142487 loss: -2.321763515472412
tau_img: 0.0077, tau_txt: 0.0074
Epoch: 17 iteration: 150 lr: 0.00012485280483142487 loss: -1.8449326753616333
tau_img: 0.0075, tau_txt: 0.0073
Epoch: 17 iteration: 200 lr: 0.00012485280483142487 loss: -2.31805157661438
tau_img: 0.0077, tau_txt: 0.0075
Epoch: 17 iteration: 250 lr: 0.00012485280483142487 loss: -2.372451066970825
tau_img: 0.0075, tau_txt: 0.0073
Computing features for evaluation...
Computing features for evaluation...
Epoch: 17
coco val: {'txt_r1': 19.98, 'txt_r5': 42.34, 'txt_r10': 54.76, 'txt_r_mean': 39.02666666666667, 'img_r1': 13.554578168732506, 'img_r5': 33.04278288684526, 'img_r10': 44.470211915233904, 'img_r_mean': 30.35585765693722, 'r_mean': 34.691262161801944}
coco test: {'txt_r1': 19.42, 'txt_r5': 42.38, 'txt_r10': 54.96, 'txt_r_mean': 38.92, 'img_r1': 13.838464614154338, 'img_r5': 33.334666133546584, 'img_r10': 44.92602958816473, 'img_r_mean': 30.69972011195522, 'r_mean': 34.80986005597761}
Epoch: 18 iteration: 0 lr: 0.00011019253581563262 loss: -4.260552406311035
tau_img: 0.0081, tau_txt: 0.0079
Epoch: 18 iteration: 50 lr: 0.00011019253581563262 loss: -2.9299917221069336
tau_img: 0.0081, tau_txt: 0.0078
Epoch: 18 iteration: 100 lr: 0.00011019253581563262 loss: -3.3400635719299316
tau_img: 0.0080, tau_txt: 0.0077
Epoch: 18 iteration: 150 lr: 0.00011019253581563262 loss: -3.453747510910034
tau_img: 0.0079, tau_txt: 0.0077
Epoch: 18 iteration: 200 lr: 0.00011019253581563262 loss: -3.1733462810516357
tau_img: 0.0081, tau_txt: 0.0078
Epoch: 18 iteration: 250 lr: 0.00011019253581563262 loss: -2.6329762935638428
tau_img: 0.0079, tau_txt: 0.0076
Computing features for evaluation...
Computing features for evaluation...
Epoch: 18
coco val: {'txt_r1': 20.76, 'txt_r5': 43.36, 'txt_r10': 55.6, 'txt_r_mean': 39.906666666666666, 'img_r1': 14.226309476209517, 'img_r5': 33.80647740903638, 'img_r10': 45.27788884446222, 'img_r_mean': 31.103558576569373, 'r_mean': 35.50511262161802}
coco test: {'txt_r1': 20.6, 'txt_r5': 43.26, 'txt_r10': 55.16, 'txt_r_mean': 39.67333333333333, 'img_r1': 14.406237504998002, 'img_r5': 34.25029988004798, 'img_r10': 45.76169532187125, 'img_r_mean': 31.47274423563908, 'r_mean': 35.57303878448621}
Epoch: 19 iteration: 0 lr: 9.602318675400897e-05 loss: -4.915426254272461
tau_img: 0.0085, tau_txt: 0.0082
Epoch: 19 iteration: 50 lr: 9.602318675400897e-05 loss: -3.8118224143981934
tau_img: 0.0083, tau_txt: 0.0082
Epoch: 19 iteration: 100 lr: 9.602318675400897e-05 loss: -3.6978960037231445
tau_img: 0.0083, tau_txt: 0.0080
Epoch: 19 iteration: 150 lr: 9.602318675400897e-05 loss: -3.7106001377105713
tau_img: 0.0082, tau_txt: 0.0080
Epoch: 19 iteration: 200 lr: 9.602318675400897e-05 loss: -4.195495128631592
tau_img: 0.0083, tau_txt: 0.0080
Epoch: 19 iteration: 250 lr: 9.602318675400897e-05 loss: -4.262701034545898
tau_img: 0.0083, tau_txt: 0.0081
Computing features for evaluation...
Computing features for evaluation...
Epoch: 19
coco val: {'txt_r1': 19.98, 'txt_r5': 43.22, 'txt_r10': 55.22, 'txt_r_mean': 39.473333333333336, 'img_r1': 14.058376649340264, 'img_r5': 33.310675729708116, 'img_r10': 44.96601359456218, 'img_r_mean': 30.778355324536857, 'r_mean': 35.125844328935095}
coco test: {'txt_r1': 19.94, 'txt_r5': 43.16, 'txt_r10': 55.72, 'txt_r_mean': 39.60666666666666, 'img_r1': 13.94642143142743, 'img_r5': 33.65053978408636, 'img_r10': 45.33386645341863, 'img_r_mean': 30.976942556310807, 'r_mean': 35.291804611488736}
Epoch: 20 iteration: 0 lr: 8.250000000000001e-05 loss: -4.490512371063232
tau_img: 0.0085, tau_txt: 0.0084
Epoch: 20 iteration: 50 lr: 8.250000000000001e-05 loss: -5.540229320526123
tau_img: 0.0088, tau_txt: 0.0085
Epoch: 20 iteration: 100 lr: 8.250000000000001e-05 loss: -5.427042484283447
tau_img: 0.0088, tau_txt: 0.0085
Epoch: 20 iteration: 150 lr: 8.250000000000001e-05 loss: -5.009304046630859
tau_img: 0.0087, tau_txt: 0.0085
Epoch: 20 iteration: 200 lr: 8.250000000000001e-05 loss: -5.154559135437012
tau_img: 0.0088, tau_txt: 0.0084
Epoch: 20 iteration: 250 lr: 8.250000000000001e-05 loss: -5.245851993560791
tau_img: 0.0087, tau_txt: 0.0085
Computing features for evaluation...
Computing features for evaluation...
Epoch: 20
coco val: {'txt_r1': 21.16, 'txt_r5': 43.64, 'txt_r10': 55.96, 'txt_r_mean': 40.25333333333333, 'img_r1': 13.914434226309476, 'img_r5': 33.954418232706914, 'img_r10': 45.64574170331867, 'img_r_mean': 31.171531387445018, 'r_mean': 35.71243236038917}
coco test: {'txt_r1': 20.46, 'txt_r5': 43.9, 'txt_r10': 55.6, 'txt_r_mean': 39.98666666666667, 'img_r1': 14.166333466613354, 'img_r5': 34.44622151139544, 'img_r10': 45.7936825269892, 'img_r_mean': 31.46874583499933, 'r_mean': 35.727706250833}
Epoch: 21 iteration: 0 lr: 6.97711384175914e-05 loss: -6.665648460388184
tau_img: 0.0093, tau_txt: 0.0089
Epoch: 21 iteration: 50 lr: 6.97711384175914e-05 loss: -5.873527526855469
tau_img: 0.0089, tau_txt: 0.0088
Epoch: 21 iteration: 100 lr: 6.97711384175914e-05 loss: -6.627588272094727
tau_img: 0.0091, tau_txt: 0.0090
Epoch: 21 iteration: 150 lr: 6.97711384175914e-05 loss: -6.532419204711914
tau_img: 0.0093, tau_txt: 0.0091
Epoch: 21 iteration: 200 lr: 6.97711384175914e-05 loss: -6.612300395965576
tau_img: 0.0092, tau_txt: 0.0090
Epoch: 21 iteration: 250 lr: 6.97711384175914e-05 loss: -5.026062965393066
tau_img: 0.0088, tau_txt: 0.0085
Computing features for evaluation...
Computing features for evaluation...
Epoch: 21
coco val: {'txt_r1': 21.2, 'txt_r5': 42.88, 'txt_r10': 55.18, 'txt_r_mean': 39.75333333333333, 'img_r1': 13.858456617353058, 'img_r5': 33.5265893642543, 'img_r10': 45.12994802079168, 'img_r_mean': 30.838331334133013, 'r_mean': 35.29583233373317}
coco test: {'txt_r1': 19.56, 'txt_r5': 42.92, 'txt_r10': 54.92, 'txt_r_mean': 39.13333333333333, 'img_r1': 14.082367053178729, 'img_r5': 33.506597361055576, 'img_r10': 45.16993202718913, 'img_r_mean': 30.919632147141144, 'r_mean': 35.02648274023724}
Epoch: 22 iteration: 0 lr: 5.797606207796559e-05 loss: -7.0506157875061035
tau_img: 0.0095, tau_txt: 0.0091
Epoch: 22 iteration: 50 lr: 5.797606207796559e-05 loss: -7.07581901550293
tau_img: 0.0093, tau_txt: 0.0090
Epoch: 22 iteration: 100 lr: 5.797606207796559e-05 loss: -7.153095245361328
tau_img: 0.0096, tau_txt: 0.0093
Epoch: 22 iteration: 150 lr: 5.797606207796559e-05 loss: -7.888920307159424
tau_img: 0.0096, tau_txt: 0.0094
Epoch: 22 iteration: 200 lr: 5.797606207796559e-05 loss: -6.130715847015381
tau_img: 0.0092, tau_txt: 0.0090
Epoch: 22 iteration: 250 lr: 5.797606207796559e-05 loss: -6.484936714172363
tau_img: 0.0093, tau_txt: 0.0089
Computing features for evaluation...
Computing features for evaluation...
Epoch: 22
coco val: {'txt_r1': 20.78, 'txt_r5': 43.78, 'txt_r10': 55.14, 'txt_r_mean': 39.9, 'img_r1': 14.338264694122351, 'img_r5': 34.038384646141544, 'img_r10': 45.71771291483407, 'img_r_mean': 31.36478741836599, 'r_mean': 35.63239370918299}
coco test: {'txt_r1': 20.3, 'txt_r5': 42.74, 'txt_r10': 55.12, 'txt_r_mean': 39.38666666666666, 'img_r1': 14.326269492203119, 'img_r5': 34.3062774890044, 'img_r10': 45.649740103958415, 'img_r_mean': 31.427429028388644, 'r_mean': 35.40704784752765}
Epoch: 23 iteration: 0 lr: 4.724400030577786e-05 loss: -9.242505073547363
tau_img: 0.0100, tau_txt: 0.0099
Epoch: 23 iteration: 50 lr: 4.724400030577786e-05 loss: -8.627782821655273
tau_img: 0.0097, tau_txt: 0.0094
Epoch: 23 iteration: 100 lr: 4.724400030577786e-05 loss: -8.229507446289062
tau_img: 0.0098, tau_txt: 0.0095
Epoch: 23 iteration: 150 lr: 4.724400030577786e-05 loss: -8.095161437988281
tau_img: 0.0101, tau_txt: 0.0099
Epoch: 23 iteration: 200 lr: 4.724400030577786e-05 loss: -7.361606597900391
tau_img: 0.0099, tau_txt: 0.0096
Epoch: 23 iteration: 250 lr: 4.724400030577786e-05 loss: -8.183349609375
tau_img: 0.0096, tau_txt: 0.0095
Computing features for evaluation...
Computing features for evaluation...
Epoch: 23
coco val: {'txt_r1': 20.56, 'txt_r5': 43.82, 'txt_r10': 55.32, 'txt_r_mean': 39.9, 'img_r1': 14.066373450619752, 'img_r5': 33.7984806077569, 'img_r10': 45.86965213914434, 'img_r_mean': 31.244835399173667, 'r_mean': 35.572417699586836}
coco test: {'txt_r1': 19.68, 'txt_r5': 43.02, 'txt_r10': 54.9, 'txt_r_mean': 39.199999999999996, 'img_r1': 14.374250299880048, 'img_r5': 34.16233506597361, 'img_r10': 45.71771291483407, 'img_r_mean': 31.418099426895907, 'r_mean': 35.30904971344795}
Epoch: 24 iteration: 0 lr: 3.769253581563263e-05 loss: -10.245454788208008
tau_img: 0.0102, tau_txt: 0.0099
Epoch: 24 iteration: 50 lr: 3.769253581563263e-05 loss: -9.013447761535645
tau_img: 0.0102, tau_txt: 0.0100
Epoch: 24 iteration: 100 lr: 3.769253581563263e-05 loss: -10.611595153808594
tau_img: 0.0104, tau_txt: 0.0101
Epoch: 24 iteration: 150 lr: 3.769253581563263e-05 loss: -8.743675231933594
tau_img: 0.0102, tau_txt: 0.0102
Epoch: 24 iteration: 200 lr: 3.769253581563263e-05 loss: -8.715897560119629
tau_img: 0.0102, tau_txt: 0.0099
Epoch: 24 iteration: 250 lr: 3.769253581563263e-05 loss: -10.123720169067383
tau_img: 0.0102, tau_txt: 0.0101
Computing features for evaluation...
Computing features for evaluation...
Epoch: 24
coco val: {'txt_r1': 20.3, 'txt_r5': 43.44, 'txt_r10': 54.48, 'txt_r_mean': 39.406666666666666, 'img_r1': 14.110355857656938, 'img_r5': 33.662534986005596, 'img_r10': 45.59776089564174, 'img_r_mean': 31.123550579768093, 'r_mean': 35.26510862321738}
coco test: {'txt_r1': 19.24, 'txt_r5': 42.34, 'txt_r10': 55.02, 'txt_r_mean': 38.86666666666667, 'img_r1': 14.466213514594163, 'img_r5': 33.75449820071971, 'img_r10': 45.529788084766096, 'img_r_mean': 31.250166600026656, 'r_mean': 35.05841663334666}
Epoch: 25 iteration: 0 lr: 2.9426316451256386e-05 loss: -11.852662086486816
tau_img: 0.0108, tau_txt: 0.0105
Epoch: 25 iteration: 50 lr: 2.9426316451256386e-05 loss: -11.105792045593262
tau_img: 0.0108, tau_txt: 0.0105
Epoch: 25 iteration: 100 lr: 2.9426316451256386e-05 loss: -9.328715324401855
tau_img: 0.0103, tau_txt: 0.0100
Epoch: 25 iteration: 150 lr: 2.9426316451256386e-05 loss: -10.47180461883545
tau_img: 0.0105, tau_txt: 0.0101
Epoch: 25 iteration: 200 lr: 2.9426316451256386e-05 loss: -9.260772705078125
tau_img: 0.0104, tau_txt: 0.0103
Epoch: 25 iteration: 250 lr: 2.9426316451256386e-05 loss: -10.207618713378906
tau_img: 0.0103, tau_txt: 0.0102
Computing features for evaluation...
Computing features for evaluation...
Epoch: 25
coco val: {'txt_r1': 20.44, 'txt_r5': 43.78, 'txt_r10': 55.58, 'txt_r_mean': 39.93333333333333, 'img_r1': 14.146341463414634, 'img_r5': 33.81447421031587, 'img_r10': 46.00559776089564, 'img_r_mean': 31.32213781154205, 'r_mean': 35.627735572437686}
coco test: {'txt_r1': 19.66, 'txt_r5': 42.9, 'txt_r10': 55.24, 'txt_r_mean': 39.26666666666667, 'img_r1': 14.47421031587365, 'img_r5': 34.27828868452619, 'img_r10': 45.725709716113556, 'img_r_mean': 31.492736238837796, 'r_mean': 35.379701452752236}
Epoch: 26 iteration: 0 lr: 2.2535908641822855e-05 loss: -10.570426940917969
tau_img: 0.0106, tau_txt: 0.0105
Epoch: 26 iteration: 50 lr: 2.2535908641822855e-05 loss: -11.204402923583984
tau_img: 0.0110, tau_txt: 0.0107
Epoch: 26 iteration: 100 lr: 2.2535908641822855e-05 loss: -12.513148307800293
tau_img: 0.0110, tau_txt: 0.0108
Epoch: 26 iteration: 150 lr: 2.2535908641822855e-05 loss: -11.783784866333008
tau_img: 0.0110, tau_txt: 0.0108
Epoch: 26 iteration: 200 lr: 2.2535908641822855e-05 loss: -11.702966690063477
tau_img: 0.0111, tau_txt: 0.0107
Epoch: 26 iteration: 250 lr: 2.2535908641822855e-05 loss: -11.340032577514648
tau_img: 0.0111, tau_txt: 0.0110
Computing features for evaluation...
Computing features for evaluation...
Epoch: 26
coco val: {'txt_r1': 20.7, 'txt_r5': 43.74, 'txt_r10': 55.58, 'txt_r_mean': 40.00666666666667, 'img_r1': 14.134346261495402, 'img_r5': 33.78248700519792, 'img_r10': 45.657736905237904, 'img_r_mean': 31.19152339064374, 'r_mean': 35.599095028655206}
coco test: {'txt_r1': 19.48, 'txt_r5': 42.86, 'txt_r10': 55.32, 'txt_r_mean': 39.22, 'img_r1': 14.29828068772491, 'img_r5': 33.98640543782487, 'img_r10': 45.55377848860456, 'img_r_mean': 31.27948820471811, 'r_mean': 35.24974410235905}
Epoch: 27 iteration: 0 lr: 1.7096805137202738e-05 loss: -12.180134773254395
tau_img: 0.0114, tau_txt: 0.0113
Epoch: 27 iteration: 50 lr: 1.7096805137202738e-05 loss: -12.57005500793457
tau_img: 0.0112, tau_txt: 0.0110
Epoch: 27 iteration: 100 lr: 1.7096805137202738e-05 loss: -12.195676803588867
tau_img: 0.0115, tau_txt: 0.0113
Epoch: 27 iteration: 150 lr: 1.7096805137202738e-05 loss: -13.575706481933594
tau_img: 0.0116, tau_txt: 0.0113
Epoch: 27 iteration: 200 lr: 1.7096805137202738e-05 loss: -14.225406646728516
tau_img: 0.0115, tau_txt: 0.0113
Epoch: 27 iteration: 250 lr: 1.7096805137202738e-05 loss: -11.519415855407715
tau_img: 0.0113, tau_txt: 0.0111
Computing features for evaluation...
Computing features for evaluation...
Epoch: 27
coco val: {'txt_r1': 20.72, 'txt_r5': 44.04, 'txt_r10': 55.4, 'txt_r_mean': 40.053333333333335, 'img_r1': 14.186325469812076, 'img_r5': 33.71451419432227, 'img_r10': 45.63374650139944, 'img_r_mean': 31.178195388511266, 'r_mean': 35.6157643609223}
coco test: {'txt_r1': 19.42, 'txt_r5': 42.88, 'txt_r10': 55.08, 'txt_r_mean': 39.126666666666665, 'img_r1': 14.50219912035186, 'img_r5': 33.982407037185126, 'img_r10': 45.569772091163536, 'img_r_mean': 31.351459416233507, 'r_mean': 35.23906304145009}
Epoch: 28 iteration: 0 lr: 1.3168597893598175e-05 loss: -14.22984790802002
tau_img: 0.0116, tau_txt: 0.0115
Epoch: 28 iteration: 50 lr: 1.3168597893598175e-05 loss: -12.658186912536621
tau_img: 0.0117, tau_txt: 0.0115
Epoch: 28 iteration: 100 lr: 1.3168597893598175e-05 loss: -14.149580001831055
tau_img: 0.0117, tau_txt: 0.0114
Epoch: 28 iteration: 150 lr: 1.3168597893598175e-05 loss: -14.180305480957031
tau_img: 0.0119, tau_txt: 0.0115
Epoch: 28 iteration: 200 lr: 1.3168597893598175e-05 loss: -14.528634071350098
tau_img: 0.0121, tau_txt: 0.0118
Epoch: 28 iteration: 250 lr: 1.3168597893598175e-05 loss: -14.142889022827148
tau_img: 0.0120, tau_txt: 0.0116
Computing features for evaluation...
Computing features for evaluation...
Epoch: 28
coco val: {'txt_r1': 20.56, 'txt_r5': 43.92, 'txt_r10': 55.18, 'txt_r_mean': 39.88666666666666, 'img_r1': 14.378248700519793, 'img_r5': 33.990403838464616, 'img_r10': 45.81767293082767, 'img_r_mean': 31.39544182327069, 'r_mean': 35.64105424496868}
coco test: {'txt_r1': 19.56, 'txt_r5': 42.92, 'txt_r10': 55.0, 'txt_r_mean': 39.160000000000004, 'img_r1': 14.550179928028788, 'img_r5': 34.11435425829668, 'img_r10': 45.765693722510996, 'img_r_mean': 31.476742636278818, 'r_mean': 35.31837131813941}
Epoch: 29 iteration: 0 lr: 1.0794325171600358e-05 loss: -14.580052375793457
tau_img: 0.0120, tau_txt: 0.0117
Epoch: 29 iteration: 50 lr: 1.0794325171600358e-05 loss: -14.782979965209961
tau_img: 0.0124, tau_txt: 0.0122
Epoch: 29 iteration: 100 lr: 1.0794325171600358e-05 loss: -13.903106689453125
tau_img: 0.0121, tau_txt: 0.0118
Epoch: 29 iteration: 150 lr: 1.0794325171600358e-05 loss: -15.160087585449219
tau_img: 0.0125, tau_txt: 0.0121
Epoch: 29 iteration: 200 lr: 1.0794325171600358e-05 loss: -14.430315017700195
tau_img: 0.0118, tau_txt: 0.0117
Epoch: 29 iteration: 250 lr: 1.0794325171600358e-05 loss: -14.369138717651367
tau_img: 0.0120, tau_txt: 0.0118
Computing features for evaluation...
Computing features for evaluation...
Epoch: 29
coco val: {'txt_r1': 20.42, 'txt_r5': 43.82, 'txt_r10': 55.34, 'txt_r_mean': 39.86000000000001, 'img_r1': 14.234306277489004, 'img_r5': 33.750499800079965, 'img_r10': 45.48180727708917, 'img_r_mean': 31.155537784886047, 'r_mean': 35.507768892443025}
coco test: {'txt_r1': 19.4, 'txt_r5': 42.76, 'txt_r10': 55.08, 'txt_r_mean': 39.08, 'img_r1': 14.434226309476209, 'img_r5': 33.8984406237505, 'img_r10': 45.577768892443025, 'img_r_mean': 31.303478608556578, 'r_mean': 35.191739304278286}

Visualization

To compare the performance of different algorithms, we also trained CLIP models using OpenCLIP; the notebook is available here. Below we plot the mean validation recall after each training epoch for CLIP (OpenCLIP) and iSogCLR.

# Mean validation recall after each training epoch, one entry per epoch.
# The iSogCLR entries are the `r_mean` values of the `coco val` lines printed
# in the training log, truncated to 4 decimals; the OpenCLIP entries come
# from a separate run.
clip_recall_vals = [9.56793, 26.4037, 29.3343, 29.7682, 30.5586, 30.8398, 30.8938, 31.5624, 30.5864, 31.1057, 31.775, 31.0977, 31.8895, 31.0098, 31.2123, 31.745, 31.8129, 32.0983, 31.163, 31.7523, 32.1975, 32.2302, 32.5441, 32.3621, 32.6608, 32.7541, 32.7528, 32.4954, 32.9994, 32.7601]
isogclr_recall_vals = [10.3831, 22.1452, 26.797, 29.2797, 30.6854, 31.7613, 32.4364, 32.3897, 31.5938, 32.287, 32.5756, 33.0849, 33.1855, 33.3042, 33.3853, 34.8619, 34.9766, 34.6912, 35.5051, 35.1258, 35.7124, 35.2958, 35.6323, 35.5724, 35.2651, 35.6277, 35.599, 35.6157, 35.641, 35.5077]

import matplotlib.pyplot as plt
import numpy as np

# Derive the x-axis from the data instead of hard-coding 30 epochs, so the
# plot keeps working if the lists above are extended or shortened. Epochs are
# shown 1-based (the training log counts them from 0).
num_epochs = max(len(clip_recall_vals), len(isogclr_recall_vals))
epochs = np.arange(1, num_epochs + 1)

# Slice the shared axis per curve so that x and y always have equal length,
# even when the two runs recorded a different number of epochs.
plt.plot(epochs[:len(clip_recall_vals)], clip_recall_vals,
         label='OpenCLIP', ls=':', marker='+', color='blue')
plt.plot(epochs[:len(isogclr_recall_vals)], isogclr_recall_vals,
         label='iSogCLR', marker='*', color='orange')

plt.ylabel('Mean Validation Recall', fontsize=18)
plt.xlabel('Epoch', fontsize=18)

plt.legend(fontsize=20)

plt.show()