% (CF2, with scale estimation) Overview of the tracker_ensemble program

%tracker_ensemble: Correlation filter tracking with convolutional features

% Input:

%   - video_path:          path to the image sequence

%   - img_files:           list of image names

%   - pos:                 initialized center position of the target in (row, col)

%   - target_sz:           initialized target size in (Height, Width)

%   - padding:             padding parameter for the search area

%   - lambda:              regularization term for ridge regression

%   - output_sigma_factor: spatial bandwidth for the Gaussian label

%   - interp_factor:       learning rate for model update

%   - cell_size:           spatial quantization level

%   - show_visualization:  set to True for showing intermediate results

% Output:

%   - positions:           predicted target position at each frame

%   - time:                time spent for tracking

%

 

function [positions, time] = tracker_ensemble(video_path,img_files, pos, target_sz,...
    padding, lambda, output_sigma_factor,interp_factor, cell_size, show_visualization)
%TRACKER_ENSEMBLE Track a target through an image sequence.
%
%   For every frame the function (from the second frame on) predicts the
%   new target position from the learned per-layer correlation filters and
%   asks estimate_scale for an updated scale factor, then re-extracts
%   features at the (new) position and updates the filters.
%
%   Returns:
%     positions - numel(img_files)-by-2 matrix; row k holds pos for frame k
%                 (the first row is the caller-supplied initial pos). Rows
%                 stay zero for frames not processed after an early stop.
%     time      - accumulated tic/toc time of the per-frame tracking work
%                 (image loading and visualization are excluded).

    % ---------------------------------------------------------------------
    % Environment setup
    % ---------------------------------------------------------------------
    % Layer indices handed to the feature extractor; per the original note
    % these are relu5-4, relu4-4 and relu3-4 in VGG Net.
    indLayers = [37,28, 19];
    numLayers = numel(indLayers);

    % Per-layer weights for combining the filter responses, laid out along
    % the third dimension so they broadcast over a stack of response maps.
    nweights = reshape([1, 0.5, 0.25], 1, 1, []);

    % Search window derived from the first image's size, the target size
    % and the padding factor (see get_search_window).
    first_im_sz = size(imread([video_path img_files{1}]));
    window_sz   = get_search_window(target_sz, first_im_sz, padding);

    % Bandwidth of the Gaussian regression label, proportional to the
    % target size and expressed in units of cells.
    output_sigma = sqrt(prod(target_sz)) * output_sigma_factor / cell_size;

    % Size of the search window measured in cells.
    l1_patch_num = floor(window_sz/ cell_size);

    % Fourier transform of the Gaussian-shaped label, computed once.
    yf = fft2(gaussian_shaped_labels(output_sigma, l1_patch_num));

    % Cosine (Hann) window with the same size as the label; it is passed
    % to the feature extractor to suppress boundary discontinuities.
    cos_window = hann(size(yf,1)) * hann(size(yf,2))';

    % Visualization interface (only created when requested).
    if show_visualization
        update_visualization = show_video(img_files, video_path);
    end

    % Outputs and model state. Variables ending in 'f' are in the Fourier
    % domain.
    num_frames   = numel(img_files);
    positions    = zeros(num_frames, 2);
    time         = 0;
    model_xf     = cell(1, numLayers);
    model_alphaf = cell(1, numLayers);

    % Scale of the target relative to the initial target_sz.
    current_scale_factor = 1;

    % ---------------------------------------------------------------------
    % Tracking loop
    % ---------------------------------------------------------------------
    for frame_idx = 1:num_frames
        % Load the current frame; replicate a single-channel image to three
        % channels.
        im = imread([video_path img_files{frame_idx}]);
        if ismatrix(im)
            im = cat(3, im, im, im);
        end

        tic();  % start timing this frame

        if frame_idx == 1
            % First frame: set up the scale estimator. It returns nothing
            % here, so it presumably keeps its state elsewhere (globals or
            % persistent variables) - TODO confirm in init_scale_para.
            init_scale_para(rgb2gray(im),target_sz, pos);
        else
            % Features around the previous position, then the new position
            % from the learned filters.
            feat = extractFeature(im, pos,window_sz, cos_window, indLayers);
            pos  = predictPosition(feat, pos, indLayers, nweights, cell_size,l1_patch_num,...
                model_xf, model_alphaf);

            % Scale estimation on the grayscale frame at the new position,
            % starting from the previous scale factor.
            current_scale_factor = estimate_scale(rgb2gray(im), pos, current_scale_factor);
        end

        % Learn / update the correlation filters from features extracted
        % at the current position (second extraction for frames > 1).
        feat = extractFeature(im, pos, window_sz, cos_window, indLayers);
        [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame_idx,...
            model_xf, model_alphaf);

        % Record the position (on frame 1 this is the initial pos) and
        % build the bounding box [x, y, w, h] with the scaled target size.
        positions(frame_idx,:) = pos;
        scaled_sz = target_sz*current_scale_factor;
        box_wh    = scaled_sz([2,1]);
        box       = [pos([2,1]) - box_wh/2, box_wh];

        time = time + toc();  % stop timing this frame

        % Show the frame with its bounding box.
        if show_visualization
            stop = update_visualization(frame_idx,box);
            if stop
                break  % user pressed Esc, stop early
            end
            drawnow
            % pause(0.05)  % uncomment to run slower
        end
    end

end

 

 

function pos = predictPosition(feat, pos, indLayers, nweights,cell_size, l1_patch_num,...
    model_xf, model_alphaf)
%PREDICTPOSITION Locate the target from per-layer correlation responses.
%
%   feat          - cell array, one feature array per entry of indLayers
%   pos           - previous target position (row, col); the features are
%                   expected to have been extracted around it
%   indLayers     - layer indices; only its length is used here
%   nweights      - per-layer weights, broadcast along the 3rd dimension
%   cell_size     - factor mapping one response cell to image pixels
%   l1_patch_num  - [rows, cols] of the response map
%   model_xf      - cell array of learned feature templates (Fourier domain)
%   model_alphaf  - cell array of learned filter coefficients (Fourier domain)
%
%   Returns the updated position in image coordinates.

% -------------------------------------------------------------------------
% Correlation filter response of every layer
% -------------------------------------------------------------------------
num_layers = length(indLayers);
res_layer  = zeros([l1_patch_num, num_layers]);

for ii = 1 : num_layers
    zf = fft2(feat{ii});

    % Linear-kernel correlation with the template, summed over channels
    % and normalized by the number of elements.
    kzf = sum(zf .* conj(model_xf{ii}), 3) / numel(zf);

    % Fast detection: back to the spatial domain. fftshift moves the
    % zero-displacement entry from (1,1) to (floor(N/2)+1, floor(M/2)+1).
    temp = real(fftshift(ifft2(model_alphaf{ii} .* kzf)));

    % Scale each layer's response so that its maximum is 1.
    res_layer(:,:,ii) = temp / max(temp(:));
end

% Weighted combination of the per-layer responses (see Eqn. 5 of the paper).
response = sum(bsxfun(@times, res_layer, nweights), 3);

% -------------------------------------------------------------------------
% Target location
% -------------------------------------------------------------------------
% The target is at the (first) maximum of the combined response. Because
% of the fftshift above, an unmoved target peaks at the shifted center of
% the map rather than at the top-left corner; the two subtractions below
% (floor(N/2) here and 1 in the final line) turn the peak index into a
% signed displacement that is zero for that center entry.
[vert_delta, horiz_delta] = find(response == max(response(:)), 1);

% size(response) equals the size of each layer's response map, so this
% does not rely on a variable leaked from the loop above.
vert_delta  = vert_delta  - floor(size(response,1)/2);
horiz_delta = horiz_delta - floor(size(response,2)/2);

% Map the displacement from response cells to image pixels.
pos = pos + cell_size * [vert_delta - 1, horiz_delta - 1];

end

 


function [model_xf, model_alphaf] = updateModel(feat, yf,interp_factor, lambda, frame,...
    model_xf, model_alphaf)
%UPDATEMODEL Train the per-layer correlation filters and blend them into the model.
%
%   feat           - cell array with one feature array per layer
%   yf             - Fourier transform of the regression label
%   interp_factor  - learning rate for the running-average update
%   lambda         - regularization term added to the kernel auto-correlation
%   frame          - frame number; frame == 1 initializes the model
%   model_xf       - cell array of feature templates (Fourier domain)
%   model_alphaf   - cell array of filter coefficients (Fourier domain)
%
%   Returns the updated model_xf and model_alphaf.

numLayers = length(feat);

% -------------------------------------------------------------------------
% Train a filter on the current frame for every layer
% -------------------------------------------------------------------------
% (The original write-up illustrated the paper's update rule for a single
% channel d at this point.)
xf     = cell(1, numLayers);
alphaf = cell(1, numLayers);

for ii = 1 : numLayers
    xf{ii} = fft2(feat{ii});

    % Linear-kernel auto-correlation, summed over channels and normalized
    % by the number of elements.
    kf = sum(xf{ii} .* conj(xf{ii}), 3) / numel(xf{ii});

    % Fast training: closed-form ridge-regression solution.
    alphaf{ii} = yf ./ (kf + lambda);
end

% -------------------------------------------------------------------------
% Model initialization or online update
% -------------------------------------------------------------------------
if frame == 1
    % First frame: the model is the filter trained on this single image.
    for ii = 1 : numLayers
        model_alphaf{ii} = alphaf{ii};
        model_xf{ii}     = xf{ii};
    end
else
    % Later frames: linear interpolation with learning rate interp_factor.
    for ii = 1 : numLayers
        model_alphaf{ii} = (1 - interp_factor) * model_alphaf{ii} + interp_factor * alphaf{ii};
        model_xf{ii}     = (1 - interp_factor) * model_xf{ii}     + interp_factor * xf{ii};
    end
end

end

 

function feat  =extractFeature(im, pos, window_sz, cos_window, indLayers)
%EXTRACTFEATURE Features of the search window centered at pos.
%
%   Crops the window_sz-sized sub-window of im around pos with
%   get_subwindow and passes it, together with the cosine window and the
%   layer indices, to get_features.
%
%   Per the original author's notes (not verifiable from this function):
%   get_features is expected to run the patch through the CNN, return one
%   feature array per entry of indLayers, and apply cos_window to reduce
%   boundary discontinuities.
%
%   The tracker calls this once on the first frame (to initialize the
%   filters) and twice on every later frame: at the previous position for
%   detection, then at the new position for the model update.

feat = get_features(get_subwindow(im, pos, window_sz), cos_window, indLayers);

end