% (CF2, including scale variation) Overview of the tracker_ensemble program
% tracker_ensemble: Correlation filter tracking with convolutional features
% Input:
% - video_path: path to the image sequence
% - img_files: list of image names
% - pos: initialized center position of the target in (row, col)
% - target_sz: initialized target size in (Height, Width)
% - padding: padding parameter for the search area
% - lambda: regularization term for ridge regression
% - output_sigma_factor: spatial bandwidth for the Gaussian label
% - interp_factor: learning rate for model update
% - cell_size: spatial quantization level
% - show_visualization: set to true to show intermediate results
% Output:
% - positions: predicted target position at each frame
% - time: time spent for tracking
%
function [positions, time] = tracker_ensemble(video_path, img_files, pos, target_sz, ...
    padding, lambda, output_sigma_factor, interp_factor, cell_size, show_visualization)
% TRACKER_ENSEMBLE Track a target through an image sequence.
%
% For every frame the function extracts features around the current
% position, predicts the new position (from the second frame on), calls
% the scale estimator, and then updates the correlation-filter model.
%
% Inputs:
%   video_path          - prefix concatenated with each file name for imread
%   img_files           - cell array of image file names
%   pos                 - initial target centre, [row, col]
%   target_sz           - initial target size, [height, width]
%   padding             - forwarded to get_search_window
%   lambda              - regularisation term forwarded to updateModel
%   output_sigma_factor - scales the bandwidth of the Gaussian label
%   interp_factor       - learning rate forwarded to updateModel
%   cell_size           - divisor mapping the search window to the label grid
%   show_visualization  - when true, frames and boxes are displayed
%
% Outputs:
%   positions - numel(img_files)-by-2 array of [row, col] positions; rows of
%               frames not reached (early stop) remain zero
%   time      - accumulated tic/toc time of the tracking part of each frame
%
% Note: the estimated scale factor is only applied to the reported box;
% window_sz itself is never rescaled in this function.

    %==========================================================================
    % Environment setting
    %==========================================================================
    % Layer indices; per the original note: relu5-4, relu4-4 and relu3-4 in VGG Net.
    indLayers = [37, 28, 19];
    numLayers = numel(indLayers);

    % Weights for combining the per-layer responses, laid out along the
    % third dimension so they broadcast over the stacked response maps.
    nweights = reshape([1, 0.5, 0.25], 1, 1, []);

    numFrames = numel(img_files);

    % Image size and search window (target plus padding around it).
    im_sz = size(imread([video_path img_files{1}]));
    window_sz = get_search_window(target_sz, im_sz, padding);

    % Bandwidth of the Gaussian label, proportional to the target size.
    output_sigma = sqrt(prod(target_sz)) * output_sigma_factor / cell_size;

    % Search window expressed in cells (the original note mentions 62x61
    % for the MotorRolling sequence); the smaller grid reduces computation.
    l1_patch_num = floor(window_sz / cell_size);

    % Fourier transform of the Gaussian regression labels, computed once.
    % NOTE(review): the original note says the label peak is shifted to the
    % top-left corner inside gaussian_shaped_labels - confirm in that helper.
    yf = fft2(gaussian_shaped_labels(output_sigma, l1_patch_num));

    % Cosine window of the same size as the labels, cached for reuse
    % (used to avoid boundary discontinuities).
    cos_window = hann(size(yf, 1)) * hann(size(yf, 2))';

    % Video interface, only created when visualisation is requested.
    if show_visualization
        update_visualization = show_video(img_files, video_path);
    end

    % Outputs and model state. Variables ending in 'f' are in the Fourier domain.
    positions = zeros(numFrames, 2);   % one [row, col] per frame
    time = 0;
    model_xf = cell(1, numLayers);
    model_alphaf = cell(1, numLayers);
    current_scale_factor = 1;

    %==========================================================================
    % Start tracking
    %==========================================================================
    for frame = 1:numFrames
        % Load the current frame; replicate a single-channel image to three channels.
        im = imread([video_path img_files{frame}]);
        if ismatrix(im)
            im = cat(3, im, im, im);
        end

        tic();   % start timing this frame

        if frame == 1
            % First frame: initialise the scale estimator from the given
            % target size and position.
            init_scale_para(rgb2gray(im), target_sz, pos);
        else
            %------------------------------------------------------------------
            % Predict the position from the learned model
            %------------------------------------------------------------------
            % Features are extracted around the previous position.
            feat = extractFeature(im, pos, window_sz, cos_window, indLayers);
            pos = predictPosition(feat, pos, indLayers, nweights, cell_size, ...
                l1_patch_num, model_xf, model_alphaf);

            % Scale estimation on the grayscale frame at the new position,
            % starting from the previous scale factor.
            % NOTE(review): the original note describes a 33-level scale
            % search inside estimate_scale - confirm in that helper.
            current_scale_factor = estimate_scale(rgb2gray(im), pos, current_scale_factor);
        end

        %----------------------------------------------------------------------
        % Learn / update the correlation filters at the (new) position
        %----------------------------------------------------------------------
        feat = extractFeature(im, pos, window_sz, cos_window, indLayers);
        [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame, ...
            model_xf, model_alphaf);

        %----------------------------------------------------------------------
        % Save the position and timing
        %----------------------------------------------------------------------
        positions(frame, :) = pos;

        % Bounding box as [x, y, width, height], using the scaled target size.
        target_sz_t = target_sz * current_scale_factor;
        box = [pos([2, 1]) - target_sz_t([2, 1]) / 2, target_sz_t([2, 1])];

        time = time + toc();   % stop timing this frame

        % Show the frame with its bounding box.
        if show_visualization
            stop = update_visualization(frame, box);
            if stop
                break   % user pressed Esc, stop early
            end
            drawnow
            % pause(0.05) % uncomment to run slower
        end
    end
end
function pos = predictPosition(feat, pos, indLayers, nweights, cell_size, l1_patch_num, ...
    model_xf, model_alphaf)
% PREDICTPOSITION Locate the target from per-layer correlation responses.
%
% Inputs:
%   feat         - cell array with one feature array per layer
%   pos          - previous target centre, [row, col]
%   indLayers    - layer indices; only its length is used here
%   nweights     - per-layer weights, broadcast along the third dimension
%   cell_size    - factor mapping response-map cells to image pixels
%   l1_patch_num - [rows, cols] of the response map
%   model_xf     - learned per-layer feature model (Fourier domain)
%   model_alphaf - learned per-layer filter coefficients (Fourier domain)
%
% Output:
%   pos - updated target centre. It is returned unchanged when no layer
%         produces a usable (positive, finite) response peak.

    %==========================================================================
    % Compute correlation filter responses at each layer
    %==========================================================================
    numLayers = length(indLayers);
    res_layer = zeros([l1_patch_num, numLayers]);
    hasResponse = false;

    for ii = 1:numLayers
        zf = fft2(feat{ii});
        % Cross-correlation with the model, summed over channels; the
        % division by numel(zf) is a normalisation.
        kzf = sum(zf .* conj(model_xf{ii}), 3) / numel(zf);

        % Fast detection; fftshift moves the zero-displacement response
        % from the corner to the centre of the map.
        layerResp = real(fftshift(ifft2(model_alphaf{ii} .* kzf)));

        % Normalise each layer by its own peak. A zero or non-finite peak
        % would turn the layer into NaN/Inf and a negative peak would flip
        % the ranking, so such a layer is left as zeros (no contribution).
        peak = max(layerResp(:));
        if isfinite(peak) && peak > 0
            res_layer(:, :, ii) = layerResp / peak;
            hasResponse = true;
        end
    end

    % Without any usable response there is no evidence of motion:
    % keep the previous position instead of returning an invalid one.
    if ~hasResponse
        return
    end

    % Combine the responses from all layers (weighted sum).
    response = sum(bsxfun(@times, res_layer, nweights), 3);

    %==========================================================================
    % Find target location
    %==========================================================================
    % The target is at the maximum response (first maximum in column-major
    % order). Because of the fftshift above, a stationary target produces a
    % peak at index floor(N/2)+1 in each dimension, not at the top-left corner.
    [~, peakIdx] = max(response(:));
    [peakRow, peakCol] = ind2sub(size(response), peakIdx);
    vert_delta = peakRow - floor(size(response, 1) / 2);
    horiz_delta = peakCol - floor(size(response, 2) / 2);

    % Map the displacement from response cells to image pixels.
    pos = pos + cell_size * [vert_delta - 1, horiz_delta - 1];
end
function [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame, ...
    model_xf, model_alphaf)
% UPDATEMODEL Train per-layer correlation filters and blend them into the model.
%
% Inputs:
%   feat          - cell array with one feature array per layer
%   yf            - Fourier transform of the regression labels
%   interp_factor - learning rate used for the running average
%   lambda        - regularisation term added to the denominator
%   frame         - frame number; frame 1 initialises the model
%   model_xf      - current per-layer feature model (Fourier domain)
%   model_alphaf  - current per-layer filter coefficients (Fourier domain)
%
% Outputs:
%   model_xf, model_alphaf - the initialised or updated model

    numLayers = length(feat);

    %==========================================================================
    % Initialization
    %==========================================================================
    xf = cell(1, numLayers);
    alphaf = cell(1, numLayers);

    %==========================================================================
    % Model update
    % Filter update as described in the paper (explained for one channel d).
    %==========================================================================
    for ii = 1:numLayers
        % 2-D Fourier transform of this layer's features.
        xf{ii} = fft2(feat{ii});
        % Auto-correlation summed over channels; the division by
        % numel(xf{ii}) is a normalisation.
        kf = sum(xf{ii} .* conj(xf{ii}), 3) / numel(xf{ii});
        % Fast training.
        alphaf{ii} = yf ./ (kf + lambda);
    end

    % Model initialization or update
    if frame == 1
        % First frame: train with a single image.
        for ii = 1:numLayers
            model_alphaf{ii} = alphaf{ii};
            model_xf{ii} = xf{ii};
        end
    else
        % Online update: running average with learning rate interp_factor.
        for ii = 1:numLayers
            model_alphaf{ii} = (1 - interp_factor) * model_alphaf{ii} + interp_factor * alphaf{ii};
            model_xf{ii} = (1 - interp_factor) * model_xf{ii} + interp_factor * xf{ii};
        end
    end
end
function feat = extractFeature(im, pos, window_sz, cos_window, indLayers)
% EXTRACTFEATURE Crop the search window around pos and compute its features.
%
% Inputs:
%   im         - current frame
%   pos        - centre of the search window
%   window_sz  - size of the search window
%   cos_window - cosine window forwarded to get_features
%   indLayers  - layer indices forwarded to get_features
%
% Output:
%   feat - whatever get_features returns for the cropped patch
%
% NOTE(review): per the original author's notes, the patch is resized to
% 224x224 and passed through the CNN inside get_features, and the resulting
% features are multiplied by the cosine window - confirm in that helper.

    % Search window from the previous detection, fed straight into the
    % hierarchical feature extractor.
    feat = get_features(get_subwindow(im, pos, window_sz), cos_window, indLayers);
end