Friday, July 1, 2011

neat matlab program I wrote

%pcabio.m

% to compute and graph PCA for stand biometrics, as well as measure distances between points!
% this file has the 10 stand metrics currently measured. The first row is
% numeric. The first column is plot names.

%------------------------------------------------------------------------
% names of files to load-- comment in what you want
%load PCAEX80.dat;
load PCABIO1984.dat
%load PCABIO2007.dat;

format short g

% creation of the file headers and column numbers. Different columns in
% early file due to no ANPP

%For 2007
% fid = fopen('PCABIO2007.dat');
% C = textscan(fid,'%s %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32');
% fclose(fid);

%for 1984
fid = fopen('PCABIO1984.dat');
C = textscan(fid,'%s %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32');
fclose(fid);

%For files that are 80
%  fid = fopen('PCAEX80.dat');
%  C = textscan(fid,'%s %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32 %f32');
%  fclose(fid);

% these are the headings that correspond to the stand metrics, in order,

%2007
%headers = {'HARDWOOD' ,'BIO' ,'PERHARD','ANPP', 'BASAL', 'NUMT', 'STEMD', 'SHANNON', 'ASSAYNO','ASSAYBIO','NUMHW','PERNUMHW','STEMDHW','PERSTEMDHW'};

%1984
headers = {'HARDWOOD' ,'BIO' ,'PERHARD','ANPP', 'BASAL', 'NUMT', 'STEMD', 'ASSAYNO','ASSAYBIO','NUMHW','PERNUMHW','STEMDHW','PERSTEMDHW','SHANNON'};

% 1980
%headers = {'HARDWOOD' ,'BIO' ,'PERHARD','BASAL', 'NUMT', 'STEMD', 'ASSAYNO','ASSAYBIO','SHANNON','NUMHW','PERHWNUM','STEMDHW','PERSTEMDHW'};

% create a matrix that does not include the first row of PCAEX07 because it
% is text for the plot numbers (see above string in %s)

%X = PCAEX80(:,2:14);
X = PCABIO1984(:,2:15);
%X = PCABIO2007(:,2:15);

%------------------------------------------CONDUCTING PCA!!!------------

% if there are no NaN's!

[PC, scores, latent] = princomp(zscore(X));

% if NaN's are included, you must centralize and standardize each row
% individually, so comment in this code!

% % subtract out the mean
% for i = 1:length(X)
% thisRow = X(i,:);
% thisRow(isnan(thisRow)) = [];
% stdX(i) = std(thisRow);
% meanX(i) = mean(thisRow);
% centerX = X - repmat(meanX,10,1);
% centerStandardX = centerX./repmat(stdX,10,1);
% end


% % conduct principle components analysis
% [PC, scores, latent] = princomp(centerStandardX);

%----------------DISPLAYING THE CUMULATIVE OUTPUT OF THE VARIANCE!---

account = cumsum(latent)./sum(latent);
disp(account);

% name eigenvectors to the vectors that you are about.
vect1 = scores(:,1);
vect2 = scores(:,2);

% plot the principle components (FACTOR ANALYSIS PLOT) with the plot
% numbers-- must make sure these are STRINGS and not FLOATS/INTS
figure(1)
plot(vect1,vect2,'b.')
xlabel('1st Principal Component')
ylabel('2nd Principal Component')
title('Principal Components of Stand Biometerics')
text(vect1+0.1, vect2, C{1}, 'FontSize',8);

% plot a scree plot of cumulative variance explained
figure(2)
pareto(latent)
xlabel('Principal Component')
ylabel('Variance Explained (%)')
title('Scree plot of variance explained and CDF')

% plot a biplot for the first two axes... can be done for 3 as well, if
% needed, but is hard to interpret in my opinion. if disp(account)'s second
% entry is bigger than about 0.55, I don't do it.

figure(3)
biplot(PC(:,1:2), 'scores',scores(:,1:2),'varlabels',headers')


% for finding the distance between the sets of points and printing them out
% by labels! Will be used later in geostats

b = zeros(length(vect1),2);
for i = 1: length(vect1)
    b(i,1) = vect1(i);
    b(i,2) = vect2(i);
end

k = 0;
for i = 1:length(b)
    for j = 1:length(b)
        if j~=i
            k=k+1;
            index(k,:) = [i j];
        end;
    end;
end;

dist = ((b(index(:,2),1)-b(index(:,1),1)).^2 + (b(index(:,2),2)-b(index(:,2),2)).^2).^0.5;

% if you want to print it to the scree-- WARNING IT WILL OVERLOAD YOUR
% MEMORY IF YOU ARE RUNNING MUCH ELSE!

% fprintf(' #1 #2 distance\n');
% fprintf('%3d %3d %10.6f\n', ...
%    [index(:,1),index(:,2),dist].');
% fprintf('Total %11.6f\n',sum(dist));

m = [index(:,1),index(:,2),dist];

% output the distance matrix information to a file that saves in the Matlab
% directory. I think this format will be adaptable well to SPSS, but it
% would be better for Matlab to have in matrix form!

%dlmwrite('biometric_distances_80.csv',m,',');
dlmwrite('biometric_distances_804.csv',m,',');
%dlmwrite('biometric_distances_07.csv',m,',');

No comments:

Post a Comment