请教关于一个字符串算法的问题(200分)

  • 主题发起人 主题发起人 daniel_zan
  • 开始时间 开始时间
D

daniel_zan

Unregistered / Unconfirmed
GUEST, unregistred user!
在一个打字的程序中有两个种文字串,如何区别并找出异同,具体如下:
原文如: 一二三四五六七八 
输入为: 十二二三四五一
要求把漏打的,错打的,多打的,打对的找出来,其中打对的用()表示
结果应为 
一二三四五六七八
一()()()()六七八  其中 一六七八(漏打),二三四五(正确)
十二()()()()一 其中 十二一(多打或错打),二三四五(正确)
十二二三四五一

请问在delphi中如何实现,请不吝赐教.
请顺便mail给我,daniel_zan@21cn.com
 
很简单吧?两个字符串,逐个对比,一致就正确,不一致则输入为错打或多大,原文为漏打。
 
要判断漏打的话,必须有个标准。比如原文是“一二三四”,输入“三四”。
可以认为 1)“一二”错打为“三四”,漏打“三四”
2)漏打“一二”
当然按常识判断可以认为是2),但要编程的话,就一定要提出具体的判断标准,
比如错打的序列有多少个和后面的原文序列匹配才认为是漏打,等等……
还是要把所有可能的情况都找出来?
 
同意kidneyball
需要一个判断标准,如何才算对、错?
一二三四五六七八
一()()()()六七八  其中 一六七八(漏打),二三四五(正确)
十二()()()()一 其中 十二一(多打或错打),二三四五(正确)
~~~~~~~~~~~~~~
十()二()()()一 不行吗?

十二二三四五一

 
十()二()()()一 当然可以,上面的例子比较简单,一个一个的比显然是不行的.
我试过每一行对于另一行的一个汉字进行比较,当一行有两个
重复的字就不行了.难道真的就没有别的方法了吗?
例如
一二三四一五六七八
零五九二三四五一 显然一个个比就不行了.
 
这属于人工智能的范畴,算法设计很重要,不能简单处理
 
呵呵,用alignment算法,应该没问题。这是生物学上,核酸比较的一种算法。
 
只要确定标准,处理起来还是比较简单的。
 
请问你使用来干什么? 呵呵
如果是来做打字训练,就好象daiqingbo所说的那样写
如果不是,请你先写出目的,然后等等我

 
这样需要复杂的判断,其实我在字符串处理方面就遇到各种头痛的情况,而且这个标准具有
不定性,可以有各种答案啊!:(
 
是一个打字考试的程序(现在只是单机版),但在具体要求方面却不是只简单的一个一个对比.
因为程序要求它可以多打也可以漏打等.可以说有一点接近人工智能的考虑方式.
我试过好多种方法,可惜都不成功.
我看见了一个台湾人编的DOS下打字程序可以判别,可是没有源程序.
可能它也有缺点,但到现在还没有发现什么问题.我想应该有一个较好的算法,可以实现.
那个什么alignment算法,我可不会用,也没听说过.
请大家帮帮忙,如果我想到了较好的方法,一定和大家讨论讨论.
 
以前的程序,是c++ 写的。原来的目的是,两个DNA 序列是有不同的,需要找出一种
匹配,使得分数最高。错配、缺失都要扣分。我想,跟daniel_zan的要求差不多。

以下是子程序align1.c
#include<stdlib.h>
#include<fstream.h>
#include<assert.h>

#define SHOW_BLANK

const MAX_LENGTH = 50000;

//Used in show alignment
const LINE_LENGTH = 60;
const NAME_LENGTH = 10;

//Alignment score
const MATCH = 2;
const MISMATCH = -6;
const EXTEND = 6;

static int Inited = 0;

static int V[128][128];
static int ICC[MAX_LENGTH],IRR[MAX_LENGTH];// saving matrix scores

static int StartA,EndA,StartB,EndB;// Alignment region
static int *sapp
// Current script append ptr
static int last
// Last script op appended
static int s_end
// Operation region
static int al_len
// Alignment length

#define gap(k) ((k) <= 0 ? 0 : EXTEND * (k)) // k-symbol indel cost

// Append "Delete k" op
#define DEL(k) /
{ /
al_len += k
/
if (last < 0){ /
last = sapp[-1] -= (int)(k)
/
}else{ /
last = *(sapp++) = -(int)(k)
/
s_end++
/
} /
}

// Append "Insert k" op
#define INS(k) /
{ /
al_len += k
/
if (last > 0){ /
last = sapp[-1] += (int)(k)
/
}else{ /
last = *(sapp++) = (int)(k)
/
s_end++
/
} /
}

// Append "Replace" op
#define REP /
{ last = *(sapp++) = 0
/
s_end++
/
al_len ++
/
}

void InitV(void)
{
for(int i = 0;i < 128;i++){
for(int j = 0;j < 128;j ++){
V[j] = MISMATCH;
if(i == j) V[j] = MATCH;
if(i == 'N' || j == 'N')V[j] = MISMATCH + 1;
}
}
Inited = 1;
}

//Find the alignment region between sequence A and sequence B
//If return zero,means no alignment region;
//else return alignment score
//Length of sequence A is M,length of sequence B is N
//The answer is recorded in StartA,EndA,StartB,EndB
//Then true alignment will be computed in this region
int FindRegion1(char *A,char *B,int M,int N)
{
int i,j
// row and column indices
int c
// best score at current point
int f
// best score ending with insertion
int d
// best score ending with deletion
int p
// best score at (i-1, j-1)
int ci
// end-point associated with c
int pi
// end-point associated with p

int Score = 0
// max score
int *va;

// Initialize the 0 th row
for ( j = 1
j <= N
j++ ){
ICC[j] = 0;
IRR[j] = -j;
}

for ( i = 1
i <= M
i++ ){
p = 0;
pi = i - 1;
ICC[0] = 0
// Initialize column 0
IRR[0] = i;
va = V[A[i-1]];
for ( j = 1
j <= N
j++ ){
f = ICC[j-1] - EXTEND;
d = ICC[j] - EXTEND;
c = p + va[B[j-1]]
// replace A[i-1] with B[j-1]
if ( c < d ){
c = d;
ci = IRR[j];
}else if ( c < f ){
c = f;
ci = IRR[j-1];
}else {
ci = pi;
}
p = ICC[j];
pi = IRR[j];

ICC[j] = c;
IRR[j] = ci;
}
if ( c > Score ){
Score = c;
EndA = i;
EndB = N;
StartA = ci;
}
}
for(j = 0;j <= N;j ++){
if(ICC[j] > Score ){
Score = ICC[j];
EndA = M;
EndB = j;
StartA = IRR[j];
}
}
if ( Score ){
if ( StartA < 0 ){
StartB = - StartA;
StartA = 0;
}else {
StartB = 0;
}
}

return(Score);
}

//Now alignment sequence A &amp
sequence B and record the alignment path
int Align1(char A[],char B[],int M,int N)
{
int midi,midj,midc,*va;
int i,j,c,e,d,s,t;

if (N <= 0){
if (M > 0) DEL(M)
return -gap(M);
}
if (M <= 1){
if (M <= 0){
INS(N)
return -gap(N);
}

midc = - gap(N+1);
midj = 0;

va = V[A[0]];
for (j = 1
j <= N
j++){
c = -gap(j-1) + va[B[j-1]] - gap(N-j);
if (c > midc){
midc = c;
midj = j;
}
}
if (midj == 0){
DEL(1)
INS(N)
}else {
if (midj > 1) INS(midj-1)
REP
if (midj < N) INS(N-midj)
}
return midc;
}

midi = M / 2;

t = EXTEND;
for(i = 0;i <= N;i++){
ICC = t = t - EXTEND;
}

t = 0;
for(i = 1;i <= midi;i++){
s = ICC[0];
ICC[0] = t = t - EXTEND;
va = V[A[i-1]];
for(j = 1;j <= N;j++){
c = s + va[B[j-1]]
//change
e = ICC[j-1] - EXTEND
//insert
d = ICC[j] - EXTEND
//delete
s = ICC[j]
//keep old value
if((c > e) &amp;&amp
(c > d)){
ICC[j] = c;
}else if(e > d){
ICC[j] = e;
}else {
ICC[j] = d;
}
}
}

t = EXTEND;
for (j = N
j >= 0
j--){
IRR[j] = t = t - EXTEND;
}

t = 0;
for (i = M-1
i >= midi
i--){
s = IRR[N];
IRR[N] = t = t - EXTEND;
va = V[A];
for (j = N-1
j >= 0
j--){
c = s + va[B[j]]
//change
e = IRR[j+1] - EXTEND
//insert
d = IRR[j] - EXTEND
//delete
s = IRR[j]
//keep old value
if((c > e) &amp;&amp
(c > d)){
IRR[j] = c;
}else if(e >= d){
IRR[j] = e;
}else {
IRR[j] = d;
}
}
}

midc = ICC[0]+IRR[0];
midj = 0;
for (j = 0
j <= N
j++){
if ((ICC[j] + IRR[j]) > midc){
midc = ICC[j] + IRR[j];
midj = j;
}
}

(void)Align1(A,B,midi,midj);
(void)Align1(A+midi,B+midj,M-midi,N-midj);

return midc;
}

//the main program call this to do alignment
//Alignment path will be recorded in argument *s
//Argument end means length of argument *s
//Argument len meana length of alignment
int ALIGN1(char A[],char B[],int M,int N,int **s,int *end,int *len)
{
int score;

if(NULL != *s)delete[] *s;
if((sapp = *s = new int[M+N]) == NULL){
cerr << "Memory out!" << endl;
cin.get();
exit(-1);
}

s_end = 0;
last = 0;
al_len = 0;

if(!Inited) InitV();//Initialize alignment score array

score = FindRegion1(A,B,M,N);
if(!score){
delete[] *s;
*s = NULL;
return(0);
}

if(StartA) DEL(StartA)
if(StartB) INS(StartB)

score = Align1(A+StartA,B+StartB,EndA - StartA,EndB - StartB);

if(EndA < M) DEL(M - EndA)
if(EndB < N) INS(N - EndB)

*end = s_end;
*len = al_len;

return(score);
}

void ShowAlign(ostream *fout,char A[],char B[],char NameA[],char NameB[],int MM,int NN,int *ss,int end,char orientA,char orientB)
{
char (*c)[3];
char *cc;
if((cc = new char[3 * (MM + NN)]) == NULL){
cerr << "Memory out !" << endl

cin.get();
exit(-1);
}

c = (char (*)[3])cc;

int i,j,*s;
int aa,bb,k;
int AllNumber = 0,ErrorNumber = 0;

s = ss;
aa = bb = k = 0;

*fout << "Head is " << s[0] << endl;
*fout << "End is " << s[end - 1] << endl;

if(s[0] > 0){
for(bb = 0;bb < s[0];bb++){
NULL;
#ifdef SHOW_BLANK
c[k][0] = ' ';
c[k][1] = ' ';
c[k][2] = B[bb];
k ++;
#endif
}
}else if(s[0] < 0){
for(aa = 0;aa < -s[0];aa++){
NULL;
#ifdef SHOW_BLANK
c[k][0] = A[aa];
c[k][1] = ' ';
c[k][2] = ' ';
k ++;
#endif
}
}else {
AllNumber ++;
c[k][0] = A[aa];
c[k][2] = B[bb];
if(c[k][0] == c[k][2]){
c[k][1] = '|';
}else {
c[k][1] = ' ';
ErrorNumber++;
}
aa++;
bb++;
k++;
}

for(i = 1;i < end - 1;i++){
if(s > 0){
AllNumber += s;
ErrorNumber += s;
for(int ii = 0;ii < s;ii++){
c[k][0] = '-';
c[k][2] = B[bb++];
c[k++][1] = ' ';
}
}else if(s < 0){
AllNumber -= s;
ErrorNumber -= s;
for(int ii = 0;ii < -s;ii++){
c[k][0] = A[aa++];
c[k][2] = '-';
c[k++][1] = ' ';
}
}else {
AllNumber++;
c[k][0] = A[aa];
c[k][2] = B[bb];
if(c[k][0] == c[k][2]) c[k][1] = '|';
else {
c[k][1] = ' ';
ErrorNumber ++;
}
aa++;
bb++;
k++;
}
}

if(s[end - 1] == 0){
AllNumber++;
c[k][0] = A[aa];
c[k][2] = B[bb];
if(c[k][0] == c[k][2]) c[k][1] = '|';
else {
c[k][1] = ' ';
ErrorNumber ++;
}
k++;
}
#ifdef SHOW_BLANK
else if(s[end - 1] > 0){
for(int ii = 0;ii < s[end - 1];ii ++){
c[k][0] = ' ';
c[k][1] = ' ';
c[k++][2] = B[bb++];
}
}else {
for(int ii = 0;ii < -s[end - 1];ii++){
c[k][0] = A[aa++];
c[k][1] = ' ';
c[k++][2] = ' ';
}
}
#endif

*fout << "dErrorRate = " << (double)ErrorNumber / (double)AllNumber << endl;
*fout << endl;
for(i = 0;i < k;i += LINE_LENGTH){
int ii;
for(ii = 0;ii < NAME_LENGTH;ii++){
if(NameA[ii] != 0)
*fout << NameA[ii];
else {
for(NULL;ii < NAME_LENGTH;ii++) *fout << ' ';
break;
}
}
*fout << orientA;
*fout << " ";

for(j = i;j < i+LINE_LENGTH &amp;&amp
j < k;j++)*fout << c[j][0];
*fout << endl;

for(j = 0;j < NAME_LENGTH+3;j++) *fout << ' ';
for(j = i;j < i+LINE_LENGTH &amp;&amp
j < k;j++)*fout << c[j][1];
*fout << endl;

for(ii = 0;ii < NAME_LENGTH;ii++){
if(NameB[ii] != 0)
*fout << NameB[ii];
else {
for(NULL;ii < NAME_LENGTH;ii++) *fout << ' ';
break;
}
}
*fout << orientB;
*fout << " ";

for(j = i;j < i+LINE_LENGTH &amp;&amp
j < k;j++)*fout << c[j][2];
*fout << endl << endl;
}
*fout << endl;

return;
}

以下是主程序,也就是一个小例子:
#include<iostream.h>

extern int ALIGN1(char A[],char B[],int M,int N,int **s,int *end,int *len);
extern void ShowAlign(ostream *fout,char A[],char B[],char NameA[],char NameB[],int MM,int NN,int *ss,int end,char orientA,char orientB);


void main(int argc, char *argv[])
{
char NameA[] = "435-seq";
int M = 546;
char A[] =
"GCACTGTCCCTCTCCATTAAATGAAGTGCTTGGATGGATTGCTATAAGGCCTGTCTGGCTCGGAGGCTTGGTGCCTCAACTCATTGCCTGCTGGTCCAAGGAAATCGAGTGCCTGAGCCAGAGTCCCCATCTCTAAGCTCCATGGTTATTGTTCTTGCCACCTGGCTAGGAAATGTCCTTCCAGCTGCCCCAGTCTAGCTGCCTCACCCTGGGGCCATGCCCCAACTCTGTCCTACCGTTCTCTGCTGCTGAC
ACTCAGCCCCTTCCCAGCTTCCAGTTGGATACAGGACCTGGGCCAGGAGAGCAGGGAGGACACTGTGGAAATGCGGCCAGGCCATCAGGGGCCTCGCAGCAGGGGACTCGAGGGGGAGCAGTGTCCAGGGCCAGAAGTGCCCTGCGGGAGAGCCAGGACATTGGCTGCCTGTGGTCTTGGTGGTCGTGGTCAGTTCCCTCTCCTGCCAGCTGTGGAATGTGAGGCCTGGCCTGGGAGATATTTTTGCTGCACTT
TGGAGCCACCCCGCCCCCTGGATACTCAGACCCTGCACA";

char NameB[] = "372-seq";
int N = 565;
char B[] =
"CTGGTCCAAGGAAATCAGTGCCTGAGCCAGAGTCCCCATCTCTAAGCTCCATGGTTATTGTTCTTGCCACCTGGCTAGGAAATGTCCTTCCAGCTGCCCCAGTCTAGCTGCCTCACCCTGGGGCCATGCCCCAACTCTGTCCTACCCTTCTCTGCTGCTGACATCAGCCCCTTCCCAGCTTCCAGTTTGATACAGGACCTGGGCCAGGAGAGCAGGGAGGACACGTGGAAATGCGGCCAGGCCATCCAGGGGC
CTCGCAGCAGGGGACTGGAGGGGGAGCAGTGTCCAGGGCCAGAAGTGCCCTGCGGGAGAGCCAGGACATTGGCTGCCTGTGGTTTGGTGGTCGTGGTCAGTTCCCTCTCCTGCCAGCTGTGGAATGTGAGGCCTGGCCTGGGAGATATTTTTGCTGCACTTTGAGCCACCCCGCCCCCTGGAACTCAGACCCTGCACAGTCCATGCCATAACAATGACGACCACTTCCAATTGTTTCCTAGCTGGAGAGGCGGG
GAGGGGAGCACTGTTTGGGAAGGGGGGGAGCCTGGGGGAAATGCTCCTAGTGACAACA";

int *s,s_end,length;
ALIGN1(A,B,M,N,&amp;s,&amp;s_end,&amp;length);
ShowAlign(&amp;cout,A,B,NameA,NameB,M,N,s,s_end,'+','+');
}

运行结果如下:
Head is -90
End is 114
dErrorRate = 0.0218818

435-seq + GCACTGTCCCTCTCCATTAAATGAAGTGCTTGGATGGATTGCTATAAGGCCTGTCTGGCT

372-seq +

435-seq + CGGAGGCTTGGTGCCTCAACTCATTGCCTGCTGGTCCAAGGAAATCGAGTGCCTGAGCCA
|||||||||||||||| |||||||||||||
372-seq + CTGGTCCAAGGAAATC-AGTGCCTGAGCCA

435-seq + GAGTCCCCATCTCTAAGCTCCATGGTTATTGTTCTTGCCACCTGGCTAGGAAATGTCCTT
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
372-seq + GAGTCCCCATCTCTAAGCTCCATGGTTATTGTTCTTGCCACCTGGCTAGGAAATGTCCTT

435-seq + CCAGCTGCCCCAGTCTAGCTGCCTCACCCTGGGGCCATGCCCCAACTCTGTCCTACCGTT
||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||
372-seq + CCAGCTGCCCCAGTCTAGCTGCCTCACCCTGGGGCCATGCCCCAACTCTGTCCTACCCTT

435-seq + CTCTGCTGCTGACACTCAGCCCCTTCCCAGCTTCCAGTTGGATACAGGACCTGGGCCAGG
|||||||||||||| |||||||||||||||||||||||| ||||||||||||||||||||
372-seq + CTCTGCTGCTGACA-TCAGCCCCTTCCCAGCTTCCAGTTTGATACAGGACCTGGGCCAGG

435-seq + AGAGCAGGGAGGACACTGTGGAAATGCGGCCAGGCCATC-AGGGGCCTCGCAGCAGGGGA
|||||||||||||||| |||||||||||||||||||||| ||||||||||||||||||||
372-seq + AGAGCAGGGAGGACAC-GTGGAAATGCGGCCAGGCCATCCAGGGGCCTCGCAGCAGGGGA

435-seq + CTCGAGGGGGAGCAGTGTCCAGGGCCAGAAGTGCCCTGCGGGAGAGCCAGGACATTGGCT
|| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||
372-seq + CTGGAGGGGGAGCAGTGTCCAGGGCCAGAAGTGCCCTGCGGGAGAGCCAGGACATTGGCT

435-seq + GCCTGTGGTCTTGGTGGTCGTGGTCAGTTCCCTCTCCTGCCAGCTGTGGAATGTGAGGCC
||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||
372-seq + GCCTGTGGT-TTGGTGGTCGTGGTCAGTTCCCTCTCCTGCCAGCTGTGGAATGTGAGGCC

435-seq + TGGCCTGGGAGATATTTTTGCTGCACTTTGGAGCCACCCCGCCCCCTGGATACTCAGACC
||||||||||||||||||||||||||||| |||||||||||||||||||| |||||||||
372-seq + TGGCCTGGGAGATATTTTTGCTGCACTTT-GAGCCACCCCGCCCCCTGGA-ACTCAGACC

435-seq + CTGCACA
|||||||
372-seq + CTGCACAGTCCATGCCATAACAATGACGACCACTTCCAATTGTTTCCTAGCTGGAGAGGC

435-seq +

372-seq + GGGGAGGGGAGCACTGTTTGGGAAGGGGGGGAGCCTGGGGGAAATGCTCCTAGTGACAAC

435-seq +

372-seq + A
 
接受答案了.
 
后退
顶部