以前的程序,是c++ 写的。原来的目的是,两个DNA 序列是有不同的,需要找出一种
匹配,使得分数最高。错配、缺失都要扣分。我想,跟daniel_zan的要求差不多。
以下是子程序align1.c
#include<stdlib.h>
#include<fstream.h>
#include<assert.h>
#define SHOW_BLANK
const MAX_LENGTH = 50000;
//Used in show alignment
const LINE_LENGTH = 60;
const NAME_LENGTH = 10;
//Alignment score
const MATCH = 2;
const MISMATCH = -6;
const EXTEND = 6;
static int Inited = 0;
static int V[128][128];
static int ICC[MAX_LENGTH],IRR[MAX_LENGTH];// saving matrix scores
static int StartA,EndA,StartB,EndB;// Alignment region
static int *sapp
// Current script append ptr
static int last
// Last script op appended
static int s_end
// Operation region
static int al_len
// Alignment length
#define gap(k) ((k) <= 0 ? 0 : EXTEND * (k)) // k-symbol indel cost
// Append "Delete k" op
#define DEL(k) /
{ /
al_len += k
/
if (last < 0){ /
last = sapp[-1] -= (int)(k)
/
}else{ /
last = *(sapp++) = -(int)(k)
/
s_end++
/
} /
}
// Append "Insert k" op
#define INS(k) /
{ /
al_len += k
/
if (last > 0){ /
last = sapp[-1] += (int)(k)
/
}else{ /
last = *(sapp++) = (int)(k)
/
s_end++
/
} /
}
// Append "Replace" op
#define REP /
{ last = *(sapp++) = 0
/
s_end++
/
al_len ++
/
}
void InitV(void)
{
for(int i = 0;i < 128;i++){
for(int j = 0;j < 128;j ++){
V[j] = MISMATCH;
if(i == j) V[j] = MATCH;
if(i == 'N' || j == 'N')V[j] = MISMATCH + 1;
}
}
Inited = 1;
}
//Find the alignment region between sequence A and sequence B
//If return zero,means no alignment region;
//else return alignment score
//Length of sequence A is M,length of sequence B is N
//The answer is recorded in StartA,EndA,StartB,EndB
//Then true alignment will be computed in this region
int FindRegion1(char *A,char *B,int M,int N)
{
int i,j
// row and column indices
int c
// best score at current point
int f
// best score ending with insertion
int d
// best score ending with deletion
int p
// best score at (i-1, j-1)
int ci
// end-point associated with c
int pi
// end-point associated with p
int Score = 0
// max score
int *va;
// Initialize the 0 th row
for ( j = 1
j <= N
j++ ){
ICC[j] = 0;
IRR[j] = -j;
}
for ( i = 1
i <= M
i++ ){
p = 0;
pi = i - 1;
ICC[0] = 0
// Initialize column 0
IRR[0] = i;
va = V[A[i-1]];
for ( j = 1
j <= N
j++ ){
f = ICC[j-1] - EXTEND;
d = ICC[j] - EXTEND;
c = p + va[B[j-1]]
// replace A[i-1] with B[j-1]
if ( c < d ){
c = d;
ci = IRR[j];
}else if ( c < f ){
c = f;
ci = IRR[j-1];
}else {
ci = pi;
}
p = ICC[j];
pi = IRR[j];
ICC[j] = c;
IRR[j] = ci;
}
if ( c > Score ){
Score = c;
EndA = i;
EndB = N;
StartA = ci;
}
}
for(j = 0;j <= N;j ++){
if(ICC[j] > Score ){
Score = ICC[j];
EndA = M;
EndB = j;
StartA = IRR[j];
}
}
if ( Score ){
if ( StartA < 0 ){
StartB = - StartA;
StartA = 0;
}else {
StartB = 0;
}
}
return(Score);
}
//Now alignment sequence A &
sequence B and record the alignment path
int Align1(char A[],char B[],int M,int N)
{
int midi,midj,midc,*va;
int i,j,c,e,d,s,t;
if (N <= 0){
if (M > 0) DEL(M)
return -gap(M);
}
if (M <= 1){
if (M <= 0){
INS(N)
return -gap(N);
}
midc = - gap(N+1);
midj = 0;
va = V[A[0]];
for (j = 1
j <= N
j++){
c = -gap(j-1) + va[B[j-1]] - gap(N-j);
if (c > midc){
midc = c;
midj = j;
}
}
if (midj == 0){
DEL(1)
INS(N)
}else {
if (midj > 1) INS(midj-1)
REP
if (midj < N) INS(N-midj)
}
return midc;
}
midi = M / 2;
t = EXTEND;
for(i = 0;i <= N;i++){
ICC = t = t - EXTEND;
}
t = 0;
for(i = 1;i <= midi;i++){
s = ICC[0];
ICC[0] = t = t - EXTEND;
va = V[A[i-1]];
for(j = 1;j <= N;j++){
c = s + va[B[j-1]]
//change
e = ICC[j-1] - EXTEND
//insert
d = ICC[j] - EXTEND
//delete
s = ICC[j]
//keep old value
if((c > e) &&
(c > d)){
ICC[j] = c;
}else if(e > d){
ICC[j] = e;
}else {
ICC[j] = d;
}
}
}
t = EXTEND;
for (j = N
j >= 0
j--){
IRR[j] = t = t - EXTEND;
}
t = 0;
for (i = M-1
i >= midi
i--){
s = IRR[N];
IRR[N] = t = t - EXTEND;
va = V[A];
for (j = N-1
j >= 0
j--){
c = s + va[B[j]]
//change
e = IRR[j+1] - EXTEND
//insert
d = IRR[j] - EXTEND
//delete
s = IRR[j]
//keep old value
if((c > e) &&
(c > d)){
IRR[j] = c;
}else if(e >= d){
IRR[j] = e;
}else {
IRR[j] = d;
}
}
}
midc = ICC[0]+IRR[0];
midj = 0;
for (j = 0
j <= N
j++){
if ((ICC[j] + IRR[j]) > midc){
midc = ICC[j] + IRR[j];
midj = j;
}
}
(void)Align1(A,B,midi,midj);
(void)Align1(A+midi,B+midj,M-midi,N-midj);
return midc;
}
//the main program call this to do alignment
//Alignment path will be recorded in argument *s
//Argument end means length of argument *s
//Argument len meana length of alignment
int ALIGN1(char A[],char B[],int M,int N,int **s,int *end,int *len)
{
int score;
if(NULL != *s)delete[] *s;
if((sapp = *s = new int[M+N]) == NULL){
cerr << "Memory out!" << endl;
cin.get();
exit(-1);
}
s_end = 0;
last = 0;
al_len = 0;
if(!Inited) InitV();//Initialize alignment score array
score = FindRegion1(A,B,M,N);
if(!score){
delete[] *s;
*s = NULL;
return(0);
}
if(StartA) DEL(StartA)
if(StartB) INS(StartB)
score = Align1(A+StartA,B+StartB,EndA - StartA,EndB - StartB);
if(EndA < M) DEL(M - EndA)
if(EndB < N) INS(N - EndB)
*end = s_end;
*len = al_len;
return(score);
}
void ShowAlign(ostream *fout,char A[],char B[],char NameA[],char NameB[],int MM,int NN,int *ss,int end,char orientA,char orientB)
{
char (*c)[3];
char *cc;
if((cc = new char[3 * (MM + NN)]) == NULL){
cerr << "Memory out !" << endl
cin.get();
exit(-1);
}
c = (char (*)[3])cc;
int i,j,*s;
int aa,bb,k;
int AllNumber = 0,ErrorNumber = 0;
s = ss;
aa = bb = k = 0;
*fout << "Head is " << s[0] << endl;
*fout << "End is " << s[end - 1] << endl;
if(s[0] > 0){
for(bb = 0;bb < s[0];bb++){
NULL;
#ifdef SHOW_BLANK
c[k][0] = ' ';
c[k][1] = ' ';
c[k][2] = B[bb];
k ++;
#endif
}
}else if(s[0] < 0){
for(aa = 0;aa < -s[0];aa++){
NULL;
#ifdef SHOW_BLANK
c[k][0] = A[aa];
c[k][1] = ' ';
c[k][2] = ' ';
k ++;
#endif
}
}else {
AllNumber ++;
c[k][0] = A[aa];
c[k][2] = B[bb];
if(c[k][0] == c[k][2]){
c[k][1] = '|';
}else {
c[k][1] = ' ';
ErrorNumber++;
}
aa++;
bb++;
k++;
}
for(i = 1;i < end - 1;i++){
if(s > 0){
AllNumber += s;
ErrorNumber += s;
for(int ii = 0;ii < s;ii++){
c[k][0] = '-';
c[k][2] = B[bb++];
c[k++][1] = ' ';
}
}else if(s < 0){
AllNumber -= s;
ErrorNumber -= s;
for(int ii = 0;ii < -s;ii++){
c[k][0] = A[aa++];
c[k][2] = '-';
c[k++][1] = ' ';
}
}else {
AllNumber++;
c[k][0] = A[aa];
c[k][2] = B[bb];
if(c[k][0] == c[k][2]) c[k][1] = '|';
else {
c[k][1] = ' ';
ErrorNumber ++;
}
aa++;
bb++;
k++;
}
}
if(s[end - 1] == 0){
AllNumber++;
c[k][0] = A[aa];
c[k][2] = B[bb];
if(c[k][0] == c[k][2]) c[k][1] = '|';
else {
c[k][1] = ' ';
ErrorNumber ++;
}
k++;
}
#ifdef SHOW_BLANK
else if(s[end - 1] > 0){
for(int ii = 0;ii < s[end - 1];ii ++){
c[k][0] = ' ';
c[k][1] = ' ';
c[k++][2] = B[bb++];
}
}else {
for(int ii = 0;ii < -s[end - 1];ii++){
c[k][0] = A[aa++];
c[k][1] = ' ';
c[k++][2] = ' ';
}
}
#endif
*fout << "dErrorRate = " << (double)ErrorNumber / (double)AllNumber << endl;
*fout << endl;
for(i = 0;i < k;i += LINE_LENGTH){
int ii;
for(ii = 0;ii < NAME_LENGTH;ii++){
if(NameA[ii] != 0)
*fout << NameA[ii];
else {
for(NULL;ii < NAME_LENGTH;ii++) *fout << ' ';
break;
}
}
*fout << orientA;
*fout << " ";
for(j = i;j < i+LINE_LENGTH &&
j < k;j++)*fout << c[j][0];
*fout << endl;
for(j = 0;j < NAME_LENGTH+3;j++) *fout << ' ';
for(j = i;j < i+LINE_LENGTH &&
j < k;j++)*fout << c[j][1];
*fout << endl;
for(ii = 0;ii < NAME_LENGTH;ii++){
if(NameB[ii] != 0)
*fout << NameB[ii];
else {
for(NULL;ii < NAME_LENGTH;ii++) *fout << ' ';
break;
}
}
*fout << orientB;
*fout << " ";
for(j = i;j < i+LINE_LENGTH &&
j < k;j++)*fout << c[j][2];
*fout << endl << endl;
}
*fout << endl;
return;
}
以下是主程序,也就是一个小例子:
#include<iostream.h>
extern int ALIGN1(char A[],char B[],int M,int N,int **s,int *end,int *len);
extern void ShowAlign(ostream *fout,char A[],char B[],char NameA[],char NameB[],int MM,int NN,int *ss,int end,char orientA,char orientB);
void main(int argc, char *argv[])
{
char NameA[] = "435-seq";
int M = 546;
char A[] =
"GCACTGTCCCTCTCCATTAAATGAAGTGCTTGGATGGATTGCTATAAGGCCTGTCTGGCTCGGAGGCTTGGTGCCTCAACTCATTGCCTGCTGGTCCAAGGAAATCGAGTGCCTGAGCCAGAGTCCCCATCTCTAAGCTCCATGGTTATTGTTCTTGCCACCTGGCTAGGAAATGTCCTTCCAGCTGCCCCAGTCTAGCTGCCTCACCCTGGGGCCATGCCCCAACTCTGTCCTACCGTTCTCTGCTGCTGAC
ACTCAGCCCCTTCCCAGCTTCCAGTTGGATACAGGACCTGGGCCAGGAGAGCAGGGAGGACACTGTGGAAATGCGGCCAGGCCATCAGGGGCCTCGCAGCAGGGGACTCGAGGGGGAGCAGTGTCCAGGGCCAGAAGTGCCCTGCGGGAGAGCCAGGACATTGGCTGCCTGTGGTCTTGGTGGTCGTGGTCAGTTCCCTCTCCTGCCAGCTGTGGAATGTGAGGCCTGGCCTGGGAGATATTTTTGCTGCACTT
TGGAGCCACCCCGCCCCCTGGATACTCAGACCCTGCACA";
char NameB[] = "372-seq";
int N = 565;
char B[] =
"CTGGTCCAAGGAAATCAGTGCCTGAGCCAGAGTCCCCATCTCTAAGCTCCATGGTTATTGTTCTTGCCACCTGGCTAGGAAATGTCCTTCCAGCTGCCCCAGTCTAGCTGCCTCACCCTGGGGCCATGCCCCAACTCTGTCCTACCCTTCTCTGCTGCTGACATCAGCCCCTTCCCAGCTTCCAGTTTGATACAGGACCTGGGCCAGGAGAGCAGGGAGGACACGTGGAAATGCGGCCAGGCCATCCAGGGGC
CTCGCAGCAGGGGACTGGAGGGGGAGCAGTGTCCAGGGCCAGAAGTGCCCTGCGGGAGAGCCAGGACATTGGCTGCCTGTGGTTTGGTGGTCGTGGTCAGTTCCCTCTCCTGCCAGCTGTGGAATGTGAGGCCTGGCCTGGGAGATATTTTTGCTGCACTTTGAGCCACCCCGCCCCCTGGAACTCAGACCCTGCACAGTCCATGCCATAACAATGACGACCACTTCCAATTGTTTCCTAGCTGGAGAGGCGGG
GAGGGGAGCACTGTTTGGGAAGGGGGGGAGCCTGGGGGAAATGCTCCTAGTGACAACA";
int *s,s_end,length;
ALIGN1(A,B,M,N,&s,&s_end,&length);
ShowAlign(&cout,A,B,NameA,NameB,M,N,s,s_end,'+','+');
}
运行结果如下:
Head is -90
End is 114
dErrorRate = 0.0218818
435-seq + GCACTGTCCCTCTCCATTAAATGAAGTGCTTGGATGGATTGCTATAAGGCCTGTCTGGCT
372-seq +
435-seq + CGGAGGCTTGGTGCCTCAACTCATTGCCTGCTGGTCCAAGGAAATCGAGTGCCTGAGCCA
|||||||||||||||| |||||||||||||
372-seq + CTGGTCCAAGGAAATC-AGTGCCTGAGCCA
435-seq + GAGTCCCCATCTCTAAGCTCCATGGTTATTGTTCTTGCCACCTGGCTAGGAAATGTCCTT
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
372-seq + GAGTCCCCATCTCTAAGCTCCATGGTTATTGTTCTTGCCACCTGGCTAGGAAATGTCCTT
435-seq + CCAGCTGCCCCAGTCTAGCTGCCTCACCCTGGGGCCATGCCCCAACTCTGTCCTACCGTT
||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||
372-seq + CCAGCTGCCCCAGTCTAGCTGCCTCACCCTGGGGCCATGCCCCAACTCTGTCCTACCCTT
435-seq + CTCTGCTGCTGACACTCAGCCCCTTCCCAGCTTCCAGTTGGATACAGGACCTGGGCCAGG
|||||||||||||| |||||||||||||||||||||||| ||||||||||||||||||||
372-seq + CTCTGCTGCTGACA-TCAGCCCCTTCCCAGCTTCCAGTTTGATACAGGACCTGGGCCAGG
435-seq + AGAGCAGGGAGGACACTGTGGAAATGCGGCCAGGCCATC-AGGGGCCTCGCAGCAGGGGA
|||||||||||||||| |||||||||||||||||||||| ||||||||||||||||||||
372-seq + AGAGCAGGGAGGACAC-GTGGAAATGCGGCCAGGCCATCCAGGGGCCTCGCAGCAGGGGA
435-seq + CTCGAGGGGGAGCAGTGTCCAGGGCCAGAAGTGCCCTGCGGGAGAGCCAGGACATTGGCT
|| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||
372-seq + CTGGAGGGGGAGCAGTGTCCAGGGCCAGAAGTGCCCTGCGGGAGAGCCAGGACATTGGCT
435-seq + GCCTGTGGTCTTGGTGGTCGTGGTCAGTTCCCTCTCCTGCCAGCTGTGGAATGTGAGGCC
||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||
372-seq + GCCTGTGGT-TTGGTGGTCGTGGTCAGTTCCCTCTCCTGCCAGCTGTGGAATGTGAGGCC
435-seq + TGGCCTGGGAGATATTTTTGCTGCACTTTGGAGCCACCCCGCCCCCTGGATACTCAGACC
||||||||||||||||||||||||||||| |||||||||||||||||||| |||||||||
372-seq + TGGCCTGGGAGATATTTTTGCTGCACTTT-GAGCCACCCCGCCCCCTGGA-ACTCAGACC
435-seq + CTGCACA
|||||||
372-seq + CTGCACAGTCCATGCCATAACAATGACGACCACTTCCAATTGTTTCCTAGCTGGAGAGGC
435-seq +
372-seq + GGGGAGGGGAGCACTGTTTGGGAAGGGGGGGAGCCTGGGGGAAATGCTCCTAGTGACAAC
435-seq +
372-seq + A