(CF2，包含尺度变化)tracker_ensemble程序简介

% tracker_ensemble: Correlation filter tracking with convolutional features
% (CF2 variant that additionally estimates the target scale).
%
% Input:
%   - video_path:          path to the image sequence
%   - img_files:           list of image names
%   - pos:                 initialized center position of the target in (row, col)
%   - target_sz:           initialized target size in (Height, Width)
%   - padding:             padding parameter for the search area
%   - lambda:              regularization term for ridge regression
%   - output_sigma_factor: spatial bandwidth for the Gaussian label
%   - interp_factor:       learning rate for model update
%   - cell_size:           spatial quantization level
%   - show_visualization:  set to true for showing intermediate results
%
% Output:
%   - positions: predicted target position at each frame, one [row, col] per frame
%   - time:      accumulated time spent for tracking (visualization excluded)
function [positions, time] = tracker_ensemble(video_path, img_files, pos, target_sz, ...
    padding, lambda, output_sigma_factor, interp_factor, cell_size, show_visualization)

    % ================================================================================
    % Environment setting
    % ================================================================================
    indLayers = [37, 28, 19];     % relu5-4, relu4-4, and relu3-4 in VGG Net
    nweights  = [1, 0.5, 0.25];   % Weights for combining correlation filter responses
    numLayers = length(indLayers);

    % Get image size and search window size (the target padded on every side)
    im_sz     = size(imread([video_path img_files{1}]));
    window_sz = get_search_window(target_sz, im_sz, padding);

    % Sigma (bandwidth) of the Gaussian label, proportional to the target size
    output_sigma = sqrt(prod(target_sz)) * output_sigma_factor / cell_size;

    % Search window size expressed in cells; working at this reduced
    % resolution (window_sz / cell_size) lowers the computational cost.
    l1_patch_num = floor(window_sz / cell_size);

    % Pre-compute the Fourier transform of the Gaussian-shaped regression labels
    yf = fft2(gaussian_shaped_labels(output_sigma, l1_patch_num));

    % Pre-compute and cache the cosine window (for avoiding boundary discontinuity);
    % it has the same spatial size as yf.
    cos_window = hann(size(yf, 1)) * hann(size(yf, 2))';

    % Create video interface for visualization
    if show_visualization
        update_visualization = show_video(img_files, video_path);
    end

    % Initialize variables for calculating FPS and distance precision
    time      = 0;
    positions = zeros(numel(img_files), 2);   % returned target centers, [y, x] per frame
    % rects = zeros(numel(img_files), 4);

    % Put the layer weights along the 3rd dimension so they broadcast over responses
    nweights = reshape(nweights, 1, 1, []);

    % Note: variables ending with 'f' are in the Fourier domain.
    % Author's note: these two parts of the filter do not correspond directly to
    % the A and B terms of the paper; see updateModel for how they are built.
    model_xf     = cell(1, numLayers);
    model_alphaf = cell(1, numLayers);

    current_scale_factor = 1;   % current scale factor relative to the initial target size

    % ================================================================================
    % Start tracking
    % ================================================================================
    for frame = 1:numel(img_files)
        im = imread([video_path img_files{frame}]);   % Load the image at the current frame
        if ismatrix(im)
            % Grayscale input: replicate to three channels
            im = cat(3, im, im, im);
        end

        tic();   % start timing this frame

        % ============================================================================
        % Predicting the object position from the learned object model
        % ============================================================================
        if frame > 1
            % Extract hierarchical convolutional features around the previous position
            feat = extractFeature(im, pos, window_sz, cos_window, indLayers);

            % Predict position
            pos = predictPosition(feat, pos, indLayers, nweights, cell_size, l1_patch_num, ...
                model_xf, model_alphaf);

            % Scale estimation on the grayscale frame, centred on the new position.
            % Author's note (estimate_scale is defined outside this file): scale
            % features are sampled at every scale level (33 in total) using the
            % previous scale factor, correlated with the scale filter to find the
            % best level, the factor is updated accordingly, and features are then
            % re-extracted at the new scale to update the scale filter.
            current_scale_factor = estimate_scale(rgb2gray(im), pos, current_scale_factor);
        else
            % First frame: set up the scale filter from the initial target.
            % Author's note (init_scale_para is defined outside this file): this
            % extracts scale features and prepares the parameters the scale filter
            % needs (padding, cell size, search window size, ...).
            init_scale_para(rgb2gray(im), target_sz, pos);
        end

        % ============================================================================
        % Learning correlation filters over hierarchical convolutional features
        % ============================================================================
        % Extract hierarchical convolutional features at the (new) position
        feat = extractFeature(im, pos, window_sz, cos_window, indLayers);

        % Model update (initialization on the first frame, interpolation afterwards)
        [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame, ...
            model_xf, model_alphaf);

        % ============================================================================
        % Save predicted position and timing
        % ============================================================================
        % On frame 1, pos is still the caller-supplied initial position.
        positions(frame, :) = pos;

        % Bounding box [x, y, w, h] of the estimate, with the size rescaled by the
        % current scale factor; used for drawing.
        target_sz_t = target_sz * current_scale_factor;
        box = [pos([2, 1]) - target_sz_t([2, 1]) / 2, target_sz_t([2, 1])];
        % rects(frame, :) = box;

        time = time + toc();   % stop timing this frame

        % Visualization: show the image and the bounding box
        if show_visualization
            % box = [pos([2,1]) - target_sz([2,1])/2, target_sz([2,1])];
            stop = update_visualization(frame, box);
            if stop, break, end   % user pressed Esc, stop early
            drawnow
            % pause(0.05)   % uncomment to run slower
        end
    end

end

function pos = predictPosition(feat, pos, indLayers, nweights, cell_size, l1_patch_num, ...
    model_xf, model_alphaf)
% PREDICTPOSITION Locate the target from per-layer correlation filter responses.
%
%   feat         - cell array with one feature map per layer, extracted around pos
%   pos          - previous target position [row, col]; returned updated
%   indLayers    - layer indices; only its length is used here
%   nweights     - per-layer weights, expected to lie along the 3rd dimension
%   cell_size    - image pixels per response cell
%   l1_patch_num - [rows, cols] of the response map (search window in cells)
%   model_xf     - learned feature templates (Fourier domain), one per layer
%   model_alphaf - learned filter coefficients (Fourier domain), one per layer

    % ================================================================================
    % Compute correlation filter responses at each layer
    % ================================================================================
    res_layer = zeros([l1_patch_num, length(indLayers)]);

    for ii = 1:length(indLayers)
        zf = fft2(feat{ii});

        % Linear-kernel correlation with the template, summed over channels.
        % The division by numel(zf) is a normalization not present in the
        % paper's formula; updateModel applies the same one.
        kzf = sum(zf .* conj(model_xf{ii}), 3) / numel(zf);

        % Equation for fast detection. fftshift moves the zero-displacement
        % response from the top-left corner to the centre of the map.
        temp = real(fftshift(ifft2(model_alphaf{ii} .* kzf)));

        % Normalize each layer's response by its maximum before combining
        res_layer(:, :, ii) = temp / max(temp(:));
    end

    % Combine responses from multiple layers (see Eqn. 5): weighted sum over layers
    response = sum(bsxfun(@times, res_layer, nweights), 3);

    % ================================================================================
    % Find target location
    % ================================================================================
    % Target location is at the maximum response. Because of the fftshift above,
    % a target that did not move peaks at the centre cell (floor(n/2) + 1) rather
    % than at the top-left corner, so that offset is removed below.
    [vert_delta, horiz_delta] = find(response == max(response(:)), 1);

    % response has the same spatial size as the per-layer spectra, so its size is
    % used here instead of relying on the loop variable zf surviving the loop.
    vert_delta  = vert_delta  - floor(size(response, 1) / 2);
    horiz_delta = horiz_delta - floor(size(response, 2) / 2);

    % Map the displacement from response cells back to image pixels
    pos = pos + cell_size * [vert_delta - 1, horiz_delta - 1];

end

function [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame, ...
    model_xf, model_alphaf)
% UPDATEMODEL Train per-layer correlation filters and blend them into the model.
%
%   feat          - cell array with one feature map per layer
%   yf            - Fourier transform of the Gaussian regression labels
%   interp_factor - learning rate for the running-average update
%   lambda        - ridge-regression regularization term
%   frame         - current frame index; frame 1 initializes the model
%   model_xf      - feature templates (Fourier domain), one per layer; returned updated
%   model_alphaf  - filter coefficients (Fourier domain), one per layer; returned updated

    numLayers = length(feat);

    % ================================================================================
    % Initialization
    % ================================================================================
    xf     = cell(1, numLayers);
    alphaf = cell(1, numLayers);

    % ================================================================================
    % Model update
    % (The original notes referred here to the paper's filter update rule,
    %  illustrated for a single channel d.)
    % ================================================================================
    for ii = 1:numLayers
        % 2-D Fourier transform of this layer's features
        xf{ii} = fft2(feat{ii});

        % Linear-kernel auto-correlation summed over channels; the division by
        % numel is a normalization that predictPosition applies identically.
        kf = sum(xf{ii} .* conj(xf{ii}), 3) / numel(xf{ii});

        % Fast training (ridge regression solved in the Fourier domain)
        alphaf{ii} = yf ./ (kf + lambda);
    end

    % Model initialization or update
    if frame == 1
        % First frame, train with a single image
        for ii = 1:numLayers
            model_alphaf{ii} = alphaf{ii};
            model_xf{ii}     = xf{ii};
        end
    else
        % Online model update using learning rate interp_factor
        for ii = 1:numLayers
            model_alphaf{ii} = (1 - interp_factor) * model_alphaf{ii} + interp_factor * alphaf{ii};
            model_xf{ii}     = (1 - interp_factor) * model_xf{ii}     + interp_factor * xf{ii};
        end
    end

end

function feat = extractFeature(im, pos, window_sz, cos_window, indLayers)
% EXTRACTFEATURE Hierarchical convolutional features of the search window.
%
%   Crops the window_sz-sized patch of im centred on pos (get_subwindow) and
%   passes it, with the cosine window and the requested layer indices, to
%   get_features.
%
%   Author's notes (both helpers are defined outside this file, so these are
%   not verifiable here): the patch is resized to 224x224 before going through
%   the CNN, and the resulting per-layer features are multiplied by the cosine
%   window to suppress boundary discontinuities.
%
%   From the second frame on this is called twice per frame: once at the
%   previous position to predict the new one, and once at the new position to
%   update the filters. On the first frame it is called once, for initialization.

    % Search window around the previous detection, fed straight to the extractor
    feat = get_features(get_subwindow(im, pos, window_sz), cos_window, indLayers);

end

(CF2，包含尺度变化)tracker_ensemble程序简介

相关推荐