% Compare imputation methods with simulated ground truth data.
% Used to generate panels of supplemental fig. 4

% ---- simulation dimensions ----
m = 400;            % number of observations
n = 20;             % number of features
q = 3;              % ground truth dimensionality (number of correlated clusters)
p = 5;              % number of features in each correlated cluster
cluster_cov = 0.5;  % within-cluster covariance

% Build a block-structured covariance matrix: q blocks of p features each
% along the diagonal. Every block before the last uses cluster_cov; the
% last block uses half of it. Features outside the q*p clustered ones stay
% uncorrelated with everything else.
sim_cov = zeros(n);
for i = 1:q
    j = (i-1)*p;    % offset of the i-th block
    if i < q
        sim_cov(j+1:j+p, j+1:j+p) = cluster_cov;
    else
        sim_cov(j+1:j+p, j+1:j+p) = cluster_cov/2;
    end
end
% unit variance for every feature (overwrites the block diagonals too)
sim_cov(logical(eye(n))) = 1;

% draw m zero-mean samples with that covariance, then standardize columns
sim_data = mvnrnd(zeros(n,1), sim_cov, m);
sim_data = zscore(sim_data);

% each cluster of p features is counted as a single dimension
k = n - q*(p-1);

% ---- left panel: empirical covariance of the simulated data ----
figure;
ah = subplot(1,2,1);
imagesc(cov(sim_data));
title({ ...
    sprintf('num features = %i, clusters = %i',n,q); ...
    sprintf('effective dimensions = %i',k)});
colormap(ah, egoalley);
colorbar;

% ---- right panel: connected components output ----
% decathlonConnCompDropN is a project helper; its output is shown as a
% heatmap with a dashed vertical line at the expected dimensionality k.
out = decathlonConnCompDropN(sim_data);
ah = subplot(1,2,2);
imagesc(out);
hold on;
plot([k k], [0.5, size(out,1)+.5], 'k--');
ylabel('features dropped');
xlabel('effective dimensionality');
title('connected components heatmap');
colormap(ah, logjet_cmap);

%% test different methods of imputation
g_truth = nanzscore(sim_data);        % reference data the imputations are scored against
frac_missing = linspace(0.1,0.5,9);   % fractions of missing entries to test
nreps = 100;                          % repetitions per fraction

% mean imputation
% one row per repetition, one column per missing fraction
mean_impute_mse = zeros(nreps, numel(frac_missing));
for i = 1:numel(frac_missing)
    fprintf('iteration %i of %i\n',i,numel(frac_missing));
    for j = 1:nreps
        D.data = g_truth;
        mask = rand(size(g_truth))