Appendix B
Matlab Programs for Contaminant Classification
This appendix contains the Matlab1 programs that were used to conduct the classification exercises described in Chapter 5 of this report.
class_init.m --initialization code
lin_class.m --code to train a linear classifier
nn_class.m --code to train a neural network classifier
class_error.m --code for error analysis
lin_predict.m --code to predict classification using the linear classifier
nn_predict.m --code to predict classification using the neural network classifier
% -------------------------------------------------------------------------
% NRC Committee on Drinking Water Contaminants
% Filename: class_init.m
% Matlab code to initialize the classification
problem.
% Data are loaded and attributes are analyzed.
% After running this, run either lin_class.m or
nn_class.m.
% ------------------------------------------------------------------------
% Load the training data set and set up data
variables
1 |
Matlab 6 ©The MathWorks, Inc. 3 Apple Hill Drive, Natick, MA 01760–2098; http://www.mathworks.com/products/matlab/. |
S=load(‘caldata.txt’); % the name of the
calibration data file
id=S(:,1) ;
t=S(:,2) ; % class labels (target)
X=S(:,3:7) ; % attributes
fid=fopen(‘caldata_id.txt’,‘r’); % the file
containing the contaminant names
names=[] ;
for i=1:length(t)
if i==1
names=str2mat(fscanf(fid,‘%s’,1)) ;
else
names=str2mat(names, fscanf(fid,‘%s’,1)) ;
end
end
fclose(fid) ;
X1=[] ; X0=[] ;
for i=1:length(t)
if t(i)==1
X1=[X1;X(i,:)] ;
end
if t(i)==0
X0=[X0;X(i,:)] ;
end
end
aaa=size (X1) ; NT1=aaa(1) ; % The number of
contaminants with T=1
aaa=size(X0); NT0=aaa(1); % The number of
contaminants with T=0
% ----------------------------------------------------------------------
% Plot correlation analysis of attributes
figure(1)
str(1)={‘Severity’} ;
str(2)={‘Potency’} ;
str(3)={‘Prevalence’} ;
str(4)={‘Magnitude’} ;
str(5)={‘Persist/Mob’} ;
fs=12;
for i=1:5
subplot (5,5,i) ,
plot (X1 (: , i), X1 (: , 1), ‘kx’, X0 (:,i), X0 (: , 1), ‘ko’, ‘LineW
idth’, 1)
axis square
set (gca, ‘LineWidth’, 1)
text (0, 12, str(i), ‘FontSize’, fs)
if i==1
text (-3, 0, str(1), ‘Rotation’, 90, ‘FontSize’, fs)
end
end
for i=2:5
subplot (5,5,i+5) ,
plot (X1 (: , i), X1 (: , 2), ‘kx’, X0 (: , i), X0 (: , 2), ‘ko’, ‘LineW
idth’, 1)
axis square
set (gca, ‘LineWidth’, 1)
if i==2
text (-3, 0, str(2), ‘Rotation’, 90, ‘FontSize’, fs)
end
end
for i=3:5
subplot (5,5,i+10) ,
plot (X1(: , i), X1(: , 3), ‘kx’, X0(: , i), X0(: , 3), ‘ko’, ‘LineW
idth’, 1)
axis square
set (gca, ‘LineWidth’, 1)
if i==3
text (-3, 0, str(3), ‘Rotation’, 90, ‘FontSize’, fs)
end
end
for i=4:5
subplot (5,5,i+15) ,
plot (X1(: , i), X1(: , 4), ‘kx’, X0(: , i), X0(: , 4), ‘ko’, ‘LineW
idth’, 1)
axis square
set (gca, ‘LineWidth’, 1)
if i==4
text (-3, 0, str(4), ‘Rotation’, 90, ‘FontSize’, fs)
end
end
for i=5:5
subplot (5,5,i+20) ,
plot (X1(: , i), X1(: , 5), ‘kx’, X0(: , i), X0(: , 5), ‘ko’, ‘LineW
idth’, 1)
axis square
set (gca, ‘LineWidth’, 1)
if i==5
text (-3, 0, str(5), ‘Rotation’, 90, ‘FontSize’, fs)
end
end
% ---------------------------------------------------
% End of program
% ---------------------------------------------------
% ---------------------------------------------------
% NRC Committee on Drinking Water Contaminants
% Filename: lin_class.m
% Matlab code to build a linear classifier on the
training data set.
% After this, run class_error.m and lin_predict.m.
% ------------------------------------------------------
% Linear Regression y=Xw where w is the weight
vector
Xlin=[X ones (length (t) , 1)]; % Add a column of ones
to fit bias/intercept.
X1lin=[X1 ones (NT1, 1)] ; % Add a column of ones to
fit bias/intercept.
X0lin=[X0 ones (NT0, 1)] ; % Add a column of ones to
fit bias/intercept.
w=pinv (Xlin) *t;
disp (‘The weights (five attributes plus offset)
are: ’) ;
disp (w) ;
y=Xlin*w;
y1=X1lin*w;
y0=X0lin*w;
meanse=sum( (y-t) . ^2)/length (t) ;
disp (‘The mean squared error is:’)
disp (meanse) ;
% -------------------------------------------------------
% End of program
% ---------------------------------------------------------------------
% ---------------------------------------------------------------------
% NRC Committee on Drinking Water Contaminants
% Filename: nn_class.m
% Matlab code to build a neural network classifier on
the training data set.
% After this, run class_error.m and nn_predict.m.
% -------------------------------------------------------------------
% Set up Neural Network with two feed forward layers.
% The first is a hidden layer containing two nodes.
% The second is an output layer with a single node.
% Both layers have biases.
% The hidden layer has a hyperbolic tangent sigmoid
transfer function.
% The output layer has a linear transfer function.
% The training algorithm uses a conjugate gradient
search method.
% Network performance is measured according to the
mean of squared errors.
figure(2)
Xminmax=[1 10; 1 10; 1 10; 1 10; 1 10] ;
tranfuns={‘tansig’ ‘purelin’};
net=newff (Xminmax, [2 1] , tranfuns, ‘traincgb’,
‘learngdm’, ’mse‘) ;
net.trainParam.min_grad=1e-10;
net.trainParam.epochs=1000000;
net.trainParam.minstep=1.0e-10;
net=train(net,X’, t’);
disp (‘Input weight matrix, bias, and transfer
function in first layer’);
net.IW{1, 1}
net.b{1}
net.layers{1}.transferFcn
if net.numLayers>1
for i=2:net.numLayers
disp (‘Layer weight matrix, bias, and transfer
function for next layer’);
net.LW{i,i-1}
net.b{i}
net.layers{i}.transferFcn
end
end
y=sim(net,X’) ; y=y’ ;
y1=sim (net, X1’); y1=y1’ ;
y0=sim(net, X0’) ; y0=y0’ ;
% ----------------------------------------------------------------
% End of program
% ----------------------------------------------------------------
% ----------------------------------------------------------------
% NRC Committee on Drinking Water Contaminants
% Filename: class_error.m
% Matlab code to determine classification error and
optimize the threshold.
% After this, run either lin_predict.m or
nn_predict.m.
% -------------------------------------------------------------------
% Classification error in training data set
minthresh=min(0, min(min(y1), min(y0)));
int=.05;
Threshrange=minthresh:int:max(max(y1) , max(y0));
idxZero=(t==0) ;
idxOne=(t>0) ;
E0=[] ; E1=[] ;
N0misclass=[] ; N1misclass=[] ; Nmisclass=[] ;
for thresh=Threshrange
classOne=(y>thresh) ;
classZero=(y<=thresh);
N0mc=sum(idxZero & classOne) ;
N1mc=sum(idxOne & classZero) ;
Nmc=N0mc+N1mc;
N0misclass=[N0misclass N0mc]; %The number of
T=0 misclassified
N1misclass=[N1misclass N1mc]; %The number of
T=1 misclassified
Nmisclass=[Nmisclass Nmc]; %The total number misclassified
e00=N0mc/sum(idxZero) ;
e11=N1mc/sum(idxOne) ;
E0=[E0 e00] ; % The fraction of T=0 contaminants that are misclassified as 1
E1=[E1 e11] ; % The fraction of T=1 contaminants that are misclassified as 0
end
figure(3)
plot (Threshrange, 100*E0, ‘ko--
’, Threshrange, 100*E1, ‘kx:’, ‘Markersize’, 8, ‘LineWidth’
, 1.5)
set (gca, ‘LineWidth’, 2, ‘fontsize’, fs);
xlabel (‘Threshold’, ‘FontSize’, fs)
ylabel (‘Classification Error (%)’, ‘FontSize’, fs)
legend (‘error for T=0 contaminants’, ‘error for T=1
contaminants’, 0)
figure(4)
plot (Threshrange, N0misclass, ‘ko--
’, Threshrange, N1misclass, ‘kx:’, Threshrange, Nmisclass,
‘k + −’, ‘Markersize’, 8, ‘LineWidth’, 1.5)
set (gca, ‘LineWidth’, 2, ‘fontsize’, fs);
xlabel (‘Threshold’, ‘FontSize’, fs)
ylabel (‘Classification Error (number that are
misclassified)’, ‘FontSize’, fs)
legend (‘number of misclassified T=0
contaminants’, ‘number of misclassified T=1
contaminants’, ‘total number of misclassified
contaminants’, 0)
% ----------------------------------------------------------------------
% Find the threshold that minimizes the total number
of misclassified contaminants
inda=find(Nmisclass==min(Nmisclass)) ;
threshes=Threshrange(inda) ;
sthreshes=size(threshes) ;
if sthreshes(2)>1 % If there are more than one
threshold values…
indb=
find(E0(inda)+E1(inda)==min(E0(inda)+E1(inda))) ;
thresh=threshes(indb); %…fine the one the minimizes the total percent error
else
thresh=threshes;
end
sthresh=size(thresh);
if sthresh(2)>1 % If there are still more than
one threshold values…
thresh=min(thresh); %…choose the smallest one.
end
disp (‘The optimal threshold is:’) ;
disp (thresh) ;
indc=find(Threshrange==thresh) ;
disp (‘The percent error in misclassifying T=1
contaminants is:’) ;
disp (100*E1(indc)) ;
disp (‘The percent error in misclassifying T=0
contaminants is:’) ;
disp (100*E0(indc)) ;
disp (‘The total number of misclassified contaminants
is:’) ;
disp (Nmisclass(indc)) ;
mis_y1=find(y1<thresh) ;
mis_y0=find(y0>thresh) ;
disp (‘Misclassified T=1 contaminants are:’) ;
for i=1:N1misclass(indc)
disp (names(mis_y1(i),:)) ;
disp ([mis_y1(i), y1(mis_y1(i))]) ;
end
disp (‘Misclassified T=0 contaminants are:’) ;
for i=1:N0misclass(indc)
disp (names(NT1+mis_y0(i),:)) ;
disp ([mis_y0 (i), y0 (mis_y0 (i))]) ;
end
% --------------------------------------------------------------
% Plot classification results as a histogram
figure(5)
fs=12;
T1col=‘w’; T0col=‘k’;
histax=Threshrange+int;
[n,xout]=hist(y1,histax) ;
bar (xout,n,.4, T1col) ;
h=findobj (gca, ‘Type’, ‘patch’) ;
set (h, ‘LineWidth’, 2)
if max(n)>30
set (gca, ‘ylim’, [0 30])
upval=num2str(max(n)) ;
text (1.025, 29, ‘\uparrow’) ;
text (1.025, 27.5, upval) ;
else
yset=max(n)+1;
set (gca, ‘Ylim’, [0 yset]) ;
end
hold on
[n,xout]=hist(y0,histax-int/2) ;
bar (xout, n,.4, T0col)
xlabel (‘ {\itY}_{\iti} ’, ‘FontSize’, fs)
ylabel (‘Number of contaminants’, ‘FontSize’, fs)
set (gca, ‘LineWidth’, 2, ‘fontsize’, fs) ;
xx=get (gca, ‘xlim’) ;
yy=get (gca, ‘ylim’) ;
line([thresh, thresh], [0,
. 9*yy (2)], ‘color’, ‘k’, ‘LineStyle’, ‘:’, ‘LineWidth’, 2) ;
ymul=.9; ymo=.1;
labxpos=xx(1)+.04*(xx(2)–xx(1)) ;
labypos=.9*yy(2) ;
boxx=[labxpos labxpos; labxpos+int/2 labxpos+int/2;
labxpos+int/2 labxpos+int/2; labxpos labxpos] ;
boxy=[ymul*labypos (ymul+ymo)*labypos; ymul*labypos
(ymul+ymo)*labypos; ymul*labypos+.05*labypos
(ymul+ymo)*labypos+.05*labypos;
ymul*labypos+.05*labypos
(ymul+ymo)*labypos+.05*labypos] ;
patch(boxx(: , 1), boxy(: , 1), T0col)
patch (boxx(: , 2), boxy(: , 2), T1col, ‘linewidth’, 2)
text (labxpos+int, (ymul+ymo)*labypos,
‘T=1’, ‘Verticalalignment’, ‘bottom’, ‘fontsize’, fs)
text (labxpos+int, ymul*labypos,
‘T=0’, ‘Verticalalignment’, ‘bottom’, ‘fontsize’, fs)
strg(1)={‘Classifier \rightarrow’} ;
strg(2)={‘Threshold ’} ;
text (thresh,. 6*yy(2), strg(1), ‘horizontalalignment’, ‘r
ight’, ‘Fontsize’, fs)
text (thresh,. 53*yy(2), strg(2), ‘horizontalalignment’, ‘
right’, ‘Fontsize’, fs)
hold off
% ------------------------------------------------------------------
% End of program
% ------------------------------------------------------------------
% ------------------------------------------------------------------
% NRC Committee on Drinking Water Contaminants
% Filename: lin_predict.m
% Matlab code to predict classification for test
cases using linear classifier.
% Run this after running lin_class.m and
class_error.m.
% ------------------------------------------------------------------
% Prediction for test cases
SP=load (‘testdata.txt’) ; % the name of the data
file containing test cases
idP=SP(: , 1) ;
XP=SP(: , 2:6) ;
XP=[XP ones(length(idP), 1)] ;
YP=XP*w;
disp (‘The predicted values for the test cases are:’) ;
for i=1:length(idP)
disp ([idP(i), YP(i)]) ;
end
% -------------------------------------------------------------------
% End of program
% -------------------------------------------------------------------
% -------------------------------------------------------------------
% NRC Committee on Drinking Water Contaminants
% Filename: nn_predict.m
% Matlab code to predict classification for test
cases using neural network classifier.
% Run this after running nn_class.m and
class_error.m.
% -------------------------------------------------------------------
% Prediction for test cases
SP=load (‘testdata.txt’) ; % the name of the data
file containing test cases
idP=SP(: , 1) ;
XP=SP(: , 2:6) ;
YP=sim (net, XP’) ;
disp (‘The predicted values for the test cases are:’) ;
for i=1:length(idP)
disp ([idP(i) , YP(i)]) ;
end
% ------------------------------------------------------------------
% End of program
% ------------------------------------------------------------------