A HMM for local alignment is shown at
http://books.google.com/books?id=R5P2GlJvigQC&pg=PA86
EDIT It's been 7 or 8 years since I really understood this stuff, but I have some old MATLAB code that implements Smith-Waterman. Since a cursory search doesn't show any such code, I figured I'd post it here. Although I don't think this is what you're really asking for, perhaps it will help you (or someone else).
function y=smithwatbl50lin(D1,D2)
% Smith-Waterman alignment of two ssDNAs
% (OR RNAS: U is equivalent to T here)
% with 5-3 ([uppercase] char) strand D*
% Uses the BLOSUM50 matrix and linear gap penalty.
% Alignment is done randomly (in that
% the traceback goes uniformly at random
% when it's got more than one possibility)
% As usual with my code, there's little room for error...
D1(find(D1=='U'))='T';
D2(find(D2=='U'))='T';
% BLOSUM50 matrix (use symmetry to minimize number of entries typed by hand)
bl50(1,1:20) = [5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0];
bl50(2,2:20) = [7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3];
bl50(3,3:20) = [7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3];
bl50(4,4:20) = [8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4];
bl50(5,5:20) = [13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1];
bl50(6,6:20) = [7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3];
bl50(7,7:20) = [6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3];
bl50(8,8:20) = [8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4];
bl50(9,9:20) = [10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4];
bl50(10,10:20)=[5 2 -3 2 0 -3 -3 -1 -3 -1 4];
bl50(11,11:20)=[5 -3 3 1 -4 -3 -1 -2 -1 1];
bl50(12,12:20)=[6 -2 -4 -1 0 -1 -3 -2 -3];
bl50(13,13:20)=[7 0 -3 -2 -1 -1 0 1];
bl50(14,14:20)=[8 -4 3 -2 1 4 -1];
bl50(15,15:20)=[10 -1 -1 -4 -3 -3];
bl50(16,16:20)=[5 2 -4 -2 -2];
bl50(17,17:20)=[5 -3 -2 0];
bl50(18,18:20)=[15 2 -3];
bl50(19,19:20)=[8 -1];
bl50(20,20)=5;
bl50=bl50+triu(bl50,1)';
% linear gap penalty
d=8;
% get integral character arrays
E1(D1=='A')='0';
E1(D1=='C')='1';
E1(D1=='G')='2';
E1(D1=='T')='3';
E2(D2=='A')='0';
E2(D2=='C')='1';
E2(D2=='G')='2';
E2(D2=='T')='3';
r=length(E1);
c=length(E2);
% translate to codons, dropping any possible end garbage
E1=E1(1:3*floor(r/3));
E2=E2(1:3*floor(c/3));
% Amino acid hash assigments
% A R N D C Q E G H I L K M F P S T W Y V
% 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
% Term --> 21
% List codons in alphabetical order (which equates to
% setting A=0, C=1, G=2, T=3 as we do above and using numerical order)
% and then hash to get
aha=[12 3 12 3 17 17 17 17 2 16 2 16 10 10 13 10];
ahc=[6 9 6 9 15 15 15 15 2 2 2 2 11 11 11 11];
ahg=[7 4 7 4 1 1 1 1 8 8 8 8 20 20 20 20];
aht=[21 19 21 19 16 16 16 16 21 5 18 5 11 14 11 14];
aminohash=[aha ahc ahg aht];
% NB. This will keep us from using zillions (ok, 64) if statements
% Now we're making polypeptides, terminating at the first 21 found
for codon=1:floor(r/3)
ah=aminohash( base2dec(E1(3*(codon-1)+1:3*codon),4) + 1);
if ah==21
break;
end
PP1(codon)=ah;
end
for codon=1:floor(c/3)
ah=aminohash( base2dec(E2(3*(codon-1)+1:3*codon),4) + 1);
if ah==21
break;
end
PP2(codon)=ah;
end
LPP1=length(PP1);
LPP2=length(PP2);
F=zeros(LPP1+1,LPP2+1);
traceback=zeros(LPP1+1,LPP2+1);
% raster-fill F...
for i=2:LPP1+1
for j=2:LPP2+1
temp1=F(i,j-1)-d; % left
temp2=F(i-1,j)-d; % up
temp3=F(i-1,j-1)+bl50(PP1(i-1),PP2(j-1)); % left and up
temp{i,j}=[0 temp1 temp2 temp3];
F(i,j)=max(temp{i,j});
temp4=find(temp{i,j}==F(i,j));
traceback(i,j)=temp4(ceil(rand*length(temp4)));
clear temp4;
end
end
iah='ARNDCQEGHILKMFPSTWYV'; % inverse amino hash string
% Find the biggest score entry. If several than pick one uniformly at random
mf=max(max(F));
tempf=find(F==mf);
[mfi,mfj]=ind2sub(size(F),tempf(ceil(rand*length(tempf))));
% in the end we'll flip alignment--REMEMBER!
backtrace=[mfi,mfj];
if F(backtrace(1),backtrace(2))==0
'no local alignment'
break;
elseif traceback(backtrace(1),backtrace(2))-1==3
alignment(:,1)=[iah(PP1(backtrace(1)-1));' ';iah(PP2(backtrace(2)-1))];
backtrace=backtrace-[1 1];
elseif traceback(backtrace(1),backtrace(2))-1==2
alignment(:,1)=[iah(PP1(backtrace(1)-1));' ';'-'];
backtrace=backtrace-[1 0];
elseif traceback(backtrace(1),backtrace(2))-1==1
alignment(:,1)=['-';' ';iah(PP2(backtrace(2)-1))];
backtrace=backtrace-[0 1];
end
k=1;
while max(backtrace>1)
k=k+1;
if F(backtrace(1),backtrace(2))==0
break;
elseif traceback(backtrace(1),backtrace(2))-1==3
alignment(:,k)=[iah(PP1(backtrace(1)-1));' ';iah(PP2(backtrace(2)-1))];
backtrace=backtrace-[1 1];
elseif traceback(backtrace(1),backtrace(2))-1==2
alignment(:,k)=[iah(PP1(backtrace(1)-1));' ';'-'];
backtrace=backtrace-[1 0];
elseif traceback(backtrace(1),backtrace(2))-1==1
alignment(:,k)=['-';' ';iah(PP2(backtrace(2)-1))];
backtrace=backtrace-[0 1];
else
break;
end
end
alignment=fliplr(alignment);
la=size(alignment,2);
for i=1:la
if alignment(1,i)==alignment(3,i)
alignment(2,i)=alignment(1,i);
elseif bl50(min(find(iah==alignment(1,i))),min(find(iah==alignment(3,i))))>0
alignment(2,i)='+';
else
alignment(2,i)=' ';
end
end
y=alignment;
No comments:
Post a Comment