"String Matching Routines" by RON NOSSAMAN

[Back to STRINGS SWAG index] [Back to Main SWAG index] [Original]
{WookieWare Home Defense Series cautiously presents
 String matching routines, Public Domain effective immediately.
 Please bestow credit in any distributed software or source.
 (Yes, they're tested. No, I don't claim to have written them in half
  an hour.)                                             }


Uses crt,dos;
const seeds:string='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
var string1,string2:string;
    i:integer;
    percent,percent2,percent3:integer;



function similar100(st1,st2:string):word;
{This one started the whole thing. Loosely based on an algorithm called
 SIMILAR.ASM  written by John W. Ratcliff and David E. Metzener
 only a lot more understandable. Returns percentage match. Pretty slow
 compared to the ASM versions. Case sensitive.
 Ron Nossaman Sept. 30 1994 }
var score:integer;

   procedure compare(s1,s2:string);
   var s1l,s1r,s2l,s2r,looker:integer;
   begin
      s1l:=1;s2l:=1;
      s1r:=length(s1);
      s2r:=length(s2);
      looker:=s2l;
   {increment s1, sweep s2}
      repeat
         if s1[s1l]=s2[looker] then
         begin             {got a match}
            inc(s1l);      {next position on everything}
            inc(looker);
            s2l:=looker;   {pull up starting position marker}
            inc(score);
         end else inc(looker); {no match, continue sweep}
         if looker>s2r then    {looker swept past end of string}
         begin
            looker:=s2l;     {restore looker to last unmatched position}
            if s2l>s2r then s1l:=s1r;
            inc(s1l);        {next char in first string for matching}
         end;
      until s1l>s1r;
   end;
begin
   score:=0;
   compare(st1,st2);
   compare(st2,st1);
   score:=(score*100)div(length(st1)+length(st2));
   similar100:=score;
end;




{$F+} {I don't know for sure, might be necessary in multi segment program}

Function Match(Var s1:String; Var s2:String):word;
{Uncle Ron's algorithm to compare two strings, returns percentage match}
{Case sensitive}
{Ron Nossaman Oct2, 1994}
begin
   asm
      LES DI,[S2]
      LDS SI,[S1]
      Xor dx,dx        {zero score}
      xor ax,ax
      cmp [si],al       {is byte1 a zero?}
      je @strerr       {yes, BAIL}
      cmp [di],al
      jne @docmp
@strerr:
      jmp @millertime        {BAIL}
      { ;neither strings zero length, do it}
@docmp:
      cld
      Xor ax,ax
      mov al,[di]       {get length S2}
      mov cx,ax         {save in cx}
      add ax,di
      mov bx,ax         {bx=pointer last byte S2}
      inc di            {di=pointer first byte S2}
      Xor ax,ax
      mov al,[si]
      push ax
      add ax,cx
      mov cx,ax         {total length both strings}
      pop ax
      add ax,si         {ax=pointer last byte S1}
      inc si            {si=pointer first byte S1}
      {ax=lastchar s1}
      {bx=lastchar s2}
      {si=firstchar s1}
      {di=firstchar s2}

      push cx           {save 'total' characters}
      push bx           {save s2 end}
      push ax           {save s1 end}

      mov cx,0          {indicator of first pass through compare}
      jmp @compare
@round2:
      LES DI,[S1]     {swap string beginnings}
      LDS SI,[S2]
      inc si
      inc di
      pop bx          {s2 end swapped}
      pop ax          {s1 end swapped}
                      {'total' still on stack}
      mov cx,1          {pass 2 indicator}

@compare:
      push cx     {save pass indicator}
      mov cx,di   {let keeper remember starting point}
@workloop:
      push ax       {save eos pointers to free up registers}
      push bx
      xor ax,ax
      mov al,[si]
      mov bx,ax
      mov al,[di]
      cmp ax,bx     {are chars equal?}
      jne @nomatch  {no, pass on}
      inc si        {yes, increment both string position pointers}
      inc di
      mov cx,di     {keeper remembers new starting position}
      inc dx        {score}
      jmp @progress
@nomatch:
      inc di    {no match, try next char in second string}
@progress:
      pop bx        {restore end of string pointers}
      pop ax
      cmp di,bx     {is string 2 used up without match?}
      jle @nofix    {nope, go on}
      mov di,cx     {restore last unmatched position}
      cmp di,bx     {is string2 matched to the end?}
      jle @nofix2   {no, go try next letter of string1}
      mov si,ax     {yes, nothing left to compare, cancel further search}
@nofix2:
      inc si        {next char string1}
@nofix:
      cmp si,ax     {done yet?}
      jle @workloop {nope, hiho}
      pop cx        {retreive pass indicator}
      cmp cx,0      {0=pass1}
      je @round2    {go back for pass 2}
      mov ax,dx     {score}
      mov cx,100
      mul cx
      pop cx      {get 'total' characters}
      div cx
@millertime:
      mov @result,ax
   end;
end;



Function Match2(Var s1:String; Var s2:String):word;
{Uncle Ron's algorithm to compare two strings, returns percentage match}
{a tad smaller, faster. Still Case sensitive}
{Ron Nossaman Oct 4, 1994}
begin
   asm
      les di,[s2]
      lds si,[s1]
      xor ax,ax
      mov al,[si]
      cmp al,0
      je @nolength
      mov cx,ax        {cx= length of string1}
      mov al,[di]
      cmp al,0
      jne @docmp       {ax= length of string2}
@nolength:
      jmp @millertime        {BAIL}

@docmp:       { ;neither strings zero length, do it}
      cld
      mov dx,ax         {save length(s2)}
      add ax,di
      mov bx,ax         {bx= pointer last char s2}
      inc di            {di= pointer first char s2}
      mov ax,dx         {retreive length(s2)}
      add ax,cx         {+length(s1)}
      push ax           {save total length both strings until final scoring}
      mov ax,cx         {length(s1)}
      add ax,si         {ax=pointer last char s1}
      inc si            {si=pointer first char s1}
      Xor dx,dx         {zero score}


      {cast:}           {ax=lastchar s1}
                        {bx=lastchar s2}
                        {si=firstchar s1}
                        {di=firstchar s2}
                        {dx=accumulated score}
                        {cx=temporary position marker during compare}


      mov cx,0          {indicator of first pass through compare}
      jmp @compare
@round2:
      les di,[s1]     {swap string beginnings}
      lds si,[s2]
      inc si
      inc di
      xchg ax,bx      {swap s1 and s2 end pointers}
                      {'total' still on stack}
      mov cx,1          {pass 2 indicator}

@compare:
      push cx     {save pass indicator}
      mov cx,di   {let keeper remember starting point}
@workloop:
      push ax       {save eos pointer to free up ax register}
      mov al,[si]
      mov ah,al
      mov al,[di]
      cmp al,ah     {are chars equal?}
      jne @nomatch  {no, pass on}
      inc si        {yes, increment both string position pointers}
      inc di
      mov cx,di     {keeper remembers new starting position}
      inc dx        {score}
      jmp @progress
@nomatch:
      inc di    {no match, try next char in second string}
@progress:
      pop ax       {restore end of string pointer}
      cmp di,bx     {is string 2 used up without match?}
      jle @nofix    {nope, go on}
      mov di,cx     {restore last unmatched position}
      cmp di,bx     {is string2 matched to the end?}
      jle @nofix2   {no, go try next letter of string1}
      mov si,ax     {yes, nothing left to compare, cancel further search}
@nofix2:
      inc si        {next char string1}
@nofix:
      cmp si,ax     {done yet?}
      jle @workloop {nope, hiho}
      pop cx        {retreive pass indicator}
      cmp cx,0      {0=pass1}
      je @round2    {go back for pass 2}
      mov ax,dx     {score}
      mov cx,100
      mul cx
      pop cx      {get 'total' characters}
      div cx
@millertime:
      mov @result,ax
   end;
end;






Function Match3(Var s1:String; Var s2:String; case_sensitive:boolean):word;
{Uncle Ron's algorithm to compare two strings, returns percentage match}
{Case sensitive/not switch    Most versatile, speed comparison varies}
{Ron Nossaman Oct 29, 1994}
begin
   asm
      push ds
      les di,[s2]
      lds si,[s1]
      xor ax,ax
      SEGDS mov al,[si]
      cmp al,0
      je @nolength
      mov cx,ax        {cx= length of string1}
      SEGES mov al,[di]
      cmp al,0
      jne @docmp       {ax= length of string2}
@nolength:
      jmp @millertime        {BAIL}

@docmp:       { ;neither strings zero length, do it}
      cld
      mov dx,ax         {save length(s2)}
      add ax,di
      mov bx,ax         {bx= pointer last char s2}
      inc di            {di= pointer first char s2}
      mov ax,dx         {retreive length(s2)}
      add ax,cx         {+length(s1)}
      push ax           {save total length both strings until final scoring}
      mov ax,cx         {length(s1)}
      add ax,si         {ax=pointer last char s1}
      inc si            {si=pointer first char s1}
      Xor dx,dx         {zero score}


      {cast:}           {ax=lastchar s1}
                        {bx=lastchar s2}
                        {si=firstchar s1}
                        {di=firstchar s2}
                        {dx=accumulated score}
                        {cx=temporary position marker during compare}


      mov cx,0          {indicator flag of first pass through compare}
                   {cheap dodge, since you can't call & ret in T.P. asm}
      jmp @compare
@round2:
      les di,[s1]     {swap string beginnings}
      lds si,[s2]
      inc si
      inc di
      xchg ax,bx      {swap s1 and s2 end pointers}
                      {'total' still on stack}
      mov cx,1          {pass 2 indicator}

@compare:
      push cx     {save pass indicator}
      mov cx,di   {let keeper remember starting point}
@workloop:
      push ax       {save eos pointer to free up ax register}
      SEGDS mov al,[si]
      cmp case_sensitive,0
      jnz @CaseOK1
      cmp al,'Z'
      jg  @CaseOK1
      cmp al,'A'
      jl  @CaseOK1
      or al,$20
@CaseOK1:
      mov ah,al
      SEGES mov al,[di]
      cmp case_sensitive,0
      jnz @CaseOK2
      cmp al,'Z'
      jg  @CaseOK2
      cmp al,'A'
      jl  @CaseOK2
      or al,$20
@CaseOK2:
      cmp al,ah     {are chars equal?}
      jne @nomatch  {no, pass on}
      inc si        {yes, increment both string position pointers}
      inc di
      mov cx,di     {keeper remembers new starting position}
      inc dx        {score}
      jmp @progress
@nomatch:
      inc di    {no match, try next char in second string}
@progress:
      pop ax       {restore end of string pointer}
      cmp di,bx     {is string 2 used up without match?}
      jle @nofix    {nope, go on}
      mov di,cx     {restore last unmatched position}
      cmp di,bx     {is string 2 matched to the end?}
      jle @nofix2   {no, go try next letter of string1}
      mov si,ax     {yes, nothing left to compare, cancel further search}
@nofix2:
      inc si        {next char string1}
@nofix:
      cmp si,ax     {done yet?}
      jle @workloop {nope, hiho}
      pop cx        {retreive pass indicator}
      cmp cx,0      {0=pass1}
      je @round2    {go back for pass 2}
      mov ax,dx     {score}
      mov cx,100
      mul cx
      pop cx      {get 'total' characters}
      div cx
@millertime:
      mov @result,ax
      pop ds
   end;
end;




function bickell2(s1,s2:string):integer; {not quite, but similar}
const
   weight:array[ord('a')..ord('{')]of byte=(
        3,6,5,4,3,5,5,4,3,8,7,4,5,3,3,5,7,4,3,3,4,6,5,8,8,9,0);
     (* a b c d e f g h i j k l m n o p q r s t u v w x y z { *)
var sort1,sort2:string;
    i,bick1,bick2:integer;
    b1,b2:array[ord('a')..ord('{')]of byte;

begin
   sort1:=s1; sort2:=s2;
   for i:=1 to length(sort1) do if (sort1[i]<'a')or(sort1[i]>'z') then
   begin
      case sort1[i] of
       'A'..'Z':sort1[i]:=char(ord(sort1[i])or 32);
        else sort1[i]:='{';
      end;
   end;
   for i:=1 to length(sort2) do if (sort2[i]<'a')or(sort2[i]>'z') then
   begin
      case sort2[i] of
       'A'..'Z':sort2[i]:=char(ord(sort2[i])or 32);
        else sort2[i]:='{';
      end;
   end;
   fillchar(b1,sizeof(b1),0);
   fillchar(b2,sizeof(b2),0);

  { weed out duplicates, sort}
   for i:=1 to length(sort1) do b1[ord(sort1[i])]:=weight[ord(sort1[i])];
   for i:=1 to length(sort2) do b2[ord(sort2[i])]:=weight[ord(sort2[i])];

  {get total for comparison}
   bick1:=0;
   for i:=ord('a') to ord('{') do bick1:=bick1+b1[i]+b2[i];

  {add up all letters common to both words}
   bick2:=0;
   for i:=ord('a') to ord('{') do if b1[i]<>0 then
     if (b1[i]=b2[i]) then bick2:=bick2+b1[i]+b2[i];

  {figure match}
   bickell2:=(bick2*100)div bick1;
end;



procedure timer;
var i:integer;
    oldpercent,percent:integer;
    h1,m1,s1,hund1,h2,m2,s2,hund2,h3,m3,s3,hund3:Word;
    strt,stp:real;
begin
   string1:='ThanKyo';
   string2:='tHanKyouR';
   write('Timing "Similar100" ');
   GetTime(h1,m1,s1,hund1);
   for i:=1 to 30000 do percent:=similar100(string1,string2);
   gettime(h2,m2,s2,hund2);
   strt:=(h1*3600)+(m1*60)+s1+(hund1/100);
   stp:=(h2*3600)+(m2*60)+s2+(hund2/100);
   WriteLn(percent,'%  Elapsed time ',(stp-strt):0:2,' seconds');
   write('Timing "Match"      ');
   GetTime(h1,m1,s1,hund1);
   for i:=1 to 30000 do percent:=match(string1,string2);
   gettime(h2,m2,s2,hund2);
   strt:=(h1*3600)+(m1*60)+s1+(hund1/100);
   stp:=(h2*3600)+(m2*60)+s2+(hund2/100);
   WriteLn(percent,'%  Elapsed time ',(stp-strt):0:2,' seconds');
   write('Timing "Match2"     ');
   gettime(h1,m1,s1,hund1);
   for i:=1 to 30000 do percent:=match2(string1,string2);
   gettime(h2,m2,s2,hund2);
   strt:=(h1*3600)+(m1*60)+s1+(hund1/100);
   stp:=(h2*3600)+(m2*60)+s2+(hund2/100);
   WriteLn(percent,'%  Elapsed time ',(stp-strt):0:2,' seconds');
   delay(100);
   write('Timing "Match3"     ');
   GetTime(h1,m1,s1,hund1);
   for i:=1 to 30000 do percent:=match3(string1,string2,false);
   gettime(h2,m2,s2,hund2);
   strt:=(h1*3600)+(m1*60)+s1+(hund1/100);
   stp:=(h2*3600)+(m2*60)+s2+(hund2/100);
   WriteLn(percent,'%  Elapsed time ',(stp-strt):0:2,' seconds');
   delay(100);
   write('Timing "Bickell"     ');
   GetTime(h1,m1,s1,hund1);
   for i:=1 to 30000 do percent:=bickell2(string1,string2);
   gettime(h2,m2,s2,hund2);
   strt:=(h1*3600)+(m1*60)+s1+(hund1/100);
   stp:=(h2*3600)+(m2*60)+s2+(hund2/100);
   WriteLn(percent,'%  Elapsed time ',(stp-strt):0:2,' seconds');
end;
{$F-}

Begin
   clrscr;

  repeat
     string1:='';
     for i:=1 to random(10)+2 do string1:=string1+copy(seeds,random(52)+1,1);
     string2:='';
     for i:=1 to random(10)+2 do string2:=string2+copy(seeds,random(52)+1,1);
      percent:=bickell2(String1,String2);
      percent2:=match3(string1,string2,false);
      if (percent>50)or(percent2>50)
         then writeln(percent,' ',percent2,'  ', string1,'  ',string2);
  until keypressed;

  for i:=1 to 3 do timer;
end.
[Back to STRINGS SWAG index] [Back to Main SWAG index] [Original]