[Back to DELPHI SWAG index]  [Back to Main SWAG index]  [Original]

//------------------------------------------------------------------------------
// RegExpUnit.Pas			Copyright (C) 1997  Object Dynamics Ltd.
//
// An implementation of regular expression searching for Delphi 2 and later. The
// code is based on that presented by Kernighan & Plauger in their book
// "Software Tools in Pascal" (ISBN 0-201-10342-7). You should consult this book
// for a detailed explanation of how this implementation works, as the code is
// not particularly heavily commented!
//
// 								*** IMPORTANT ***
//
// By using this code, you accept the following conditions:
//
//  	You may use and adapt this code freely, but it remains the
// 	copyright of Object Dynamics Ltd. Any adaptations must retain the
// 	copyright message at the head of this file.
//
//		You use this code at your own risk. Object Dynamics is not responsible
//    for any loss or damage caused by programs using this code.
//
//
// History:
//
//		Version 1.0 Created by Neil Butterworth, September 1997
//       - Fixed problem with .* not matching entire line
//
//------------------------------------------------------------------------------

unit ODRegExpUnit;

interface

//------------------------------------------------------------------------------
// Find
//
// Searches the string "str" from position "start" for the pattern "pat". The
// pattern must have been constructed using one of the two MakePattern functions
// described below.
//
// The function returns the position of the pattern in the string, or zero if
// no match is found. If there is a match, the length of matched characters
// is returned in the "len" parameter.
//
// If the "csense" parameter is false, the case of characters is ignored.
//------------------------------------------------------------------------------

function Find( const str : string; start : integer;
					const pat : string; var len : integer;
               csense : boolean ) : integer;


//------------------------------------------------------------------------------
// MakePattern
//
// Constructs an encoded version of a regular expression, required for input
// as the "pat" parameter of Find, described above.
//
// MakePattern begins construction of the pattern at the position indicated
// by "start" - this should almost always be 1. This extraneous parameter is
// maintained for future Software Tools compatibility.
//------------------------------------------------------------------------------

function MakePattern( const pat : string; start : integer ) : string;

//------------------------------------------------------------------------------
// MakePatternNoRegexp
//
// As above, but treats all regular expression characters as non-special. For
// example:
//
//		MakePattern( '[a-z]') wo
//
// would create apattern that would match the string '[a-z]', rather than
// one that would match a single lower-case character.
//------------------------------------------------------------------------------

function MakePatternNoRegEx( const pat : string; start : integer ) : string;


//------------------------------------------------------------------------------


implementation

uses
	SysUtils;

//------------------------------------------------------------------------------
// The following constants define the symbols used by regular expressions. The
// set used is identical to that used by UNIX programs such as vi and ed, but
// does not include extended regular expressions as used by (for example )nawk.
//------------------------------------------------------------------------------

const
	CLOSURE = '*';					// match zero or more of preceding character
   BOL = '^';                 // beginning of line
	EOL = '$';                 // end of line
	ESCAPE = '\';              // escape next character
   DASH = '-';                // used in [a-z] type expressions
   NEGATE = '^';              // negate next character/range in [a-z] expression
   CCL ='[';                  // intro for [a-z] expressions
   CCLEND = ']';             	// outro for [a-z] expressions
   ANY = '.';                	// match any single character


//------------------------------------------------------------------------------
// These are used for internally encoding expressions
//------------------------------------------------------------------------------

   NCCL = '!';    				// negate [a-z] must not be same as NEGATE!!!
   LITCHAR = '@';           	// quote single literal character
   TAB = char( 9 );           // tab

//------------------------------------------------------------------------------
// Convert a single character to uppercase (if possible)
//------------------------------------------------------------------------------

function ToUpper( c : char ) : char;
begin
	if (( c >= 'a' ) and (c <='z')) then
   	result := char(integer(c) - 32)
   else
   	result := c;
end;

//------------------------------------------------------------------------------
// Compare two characters for equality. If csense is false, the comparison
// ignores case.
//------------------------------------------------------------------------------

function CmpChar( c1, c2 : char; csense : boolean ) : boolean;
begin
	if ( csense ) then
   	result := c1 = c2
   else
   	result := ToUpper(c1) = ToUpper(c2);
end;

//------------------------------------------------------------------------------
// Check if single character is alphanumeric.
//------------------------------------------------------------------------------

function IsAlphaNum( c : char ) : boolean;
begin
	result := ((c>= 'A') and (c<='Z'))
   				or ((c >='a') and (c<='z'))
               or ((c>='0') and (c<='9'));
end;

//------------------------------------------------------------------------------
// Check if index is beyond the last character position in a string.
//------------------------------------------------------------------------------

function AtEnd( const str : string; index : integer ) : boolean;
begin
	result := index > Length( str );
end;

//------------------------------------------------------------------------------
// Expand an escaped character.
//------------------------------------------------------------------------------

function Esc( const s : string; var i : integer ) : char;
begin
	if ( s[i] <> ESCAPE ) then
   	result := s[i]
   else if ( AtEnd( s, i+1 )) then
      result := ESCAPE
   else begin
   	inc( i );
      if ( s[i] = 't' ) then
      	result := TAB
      else
      	result := s[i];
   end;
end;

//------------------------------------------------------------------------------
// Expand a character class in the form c1-c2. For example a-d expands to "abcd".
//------------------------------------------------------------------------------

function ExpandDash( delim : char; const pat : string; var i : integer ) : string;
var
	k : char;
begin
	result := '';
	while( (pat[i] <> delim) and (not AtEnd( pat, i ))) do begin
   	if ( pat[i] = ESCAPE ) then
      	result := result + Esc( pat, i )
      else if ( pat[i] <> DASH ) then
      	result := result + pat[i]
      else if ( AtEnd( pat, i )) then
      	result := result + DASH
      else if ( IsAlphaNum( pat[i-1] )
      		and IsAlphaNum( pat[i+1])
            and ( pat[i-1] <= pat[i+1])) then  begin
      	for k := char(integer(pat[i-1]) + 1) to pat[i+1] do
         	result := result +  k;
         inc( i );
      end
      else
      	result := result + DASH;
      inc( i );
   end;
end;

//------------------------------------------------------------------------------
// Expand character class in form [a-z]
//------------------------------------------------------------------------------

function ExpandCharClass( const c : string; var i : integer ) : string;
var
	countpos : integer;
   tmp : string;
begin
	result := '';
	inc( i );
   if ( c[i] = NEGATE ) then begin
   	result := result + NCCL;
      inc ( i );
   end
   else
   	result := result + CCL;
	result := result + ' ';
   countpos := Length( result );
   tmp := ExpandDash( CCLEND, c, i );
   result[countpos] := char(length(tmp));
   if ( c[i] = CCLEND ) then
   	result := result + tmp
   else
   	result := '';
end;

//------------------------------------------------------------------------------
// Insert a closure symbol at position cpos.
//------------------------------------------------------------------------------

procedure InsertClosure( var pat : string; cpos : integer );
begin
	Insert( CLOSURE, pat, cpos );
end;

//------------------------------------------------------------------------------
// Construct a pattern from an expression. A pattern is an expanded encoding
// of an expression, which the search functions need. This version ignores
// ALL regular expression characters.
//
// The "start" parameter indicates the starting point in the expression. This
// will almost always be 1.
//------------------------------------------------------------------------------

function MakePatternNoRegEx( const pat : string; start : integer ) : string;
var
	i : integer;
begin
	result := '';
   for i := start to length( pat ) do
   	result := result + LITCHAR + pat[i];
end;

//------------------------------------------------------------------------------
// As above, but handles regualr expression characters.
//------------------------------------------------------------------------------

function MakePattern( const pat : string; start : integer ) : string;
var
	p, pstart, i : integer;
begin
	i := start;
   result := '';
	pstart := 0;
  	while( not AtEnd( pat, i ) ) do begin
		if ( pat[i] = ANY ) then begin
			pstart := Length( result ) + 1;
      	result := result + ANY;
      end
      else if ( (pat[i] = BOL) and (i = start )) then begin
      	pstart := Length( result ) + 1;
      	result := result + BOL;
      end
      else if ( (pat[i] =EOL) and AtEnd(pat, i+1 )) then begin
      	pstart := length( result ) + 1;
      	result := result + EOL;
      end
      else if ( pat[i] = CCL ) then begin
      	pstart := length( result ) + 1;
      	result := result + ExpandCharClass( pat, i );
      end
      else if ( ( pat[i] = CLOSURE ) and ( i > start )) then begin
      	p := pstart;
         pstart := length( result ) + 1;
			if ( ( p < 1 ) or (result[p] in [BOL, EOL, CLOSURE]) ) then begin
         	result := '';
            exit;
         end;
         InsertClosure( result, p );
      end
      else begin
			pstart := length( result ) + 1;
      	result := result + LITCHAR + Esc( pat, i );
		end;
      inc( i );
   end;
end;

//------------------------------------------------------------------------------
// Get length of piece of a pattern. For example, the encoded length of the
// expression 'a' is 2, as it is encodeds a LITCHAR followed by an 'a'
//------------------------------------------------------------------------------

function PatSize( const pat : string; n : integer ) : integer;
begin
	if ( pat[n] = LITCHAR ) then
   	result := 2
   else if ( pat[n] in [BOL, EOL, ANY, CLOSURE] ) then
   	result := 1
   else if ( (pat[n] = CCL) or (pat[n] = NCCL)) then
   	result := integer( pat[n+1] ) + 2;
end;

//------------------------------------------------------------------------------
// Find single character in character class.
//------------------------------------------------------------------------------

function LocateChar( c : char; const pat : string; offset : integer;
									csense : boolean ) : boolean;
var
	i : integer;
begin
	result := false;
   i := offset + integer( pat[offset] );
   while( i > offset ) do begin
   	if ( CmpChar(c, pat[i], csense ) ) then begin
      	result := true;
         exit;
      end;
      dec(i );
   end;
end;

//------------------------------------------------------------------------------
// Match a single pattern element against a string.
//------------------------------------------------------------------------------

function MatchOne( const str : string; var i : integer;
							const pat : string; j : integer;
                     csense : boolean ) : boolean;
var
	advance : integer;
begin
	advance := -1;
   if ( AtEnd( str, i ) ) then begin
   	if ( pat[j] = EOL ) then
      	advance := 0;
   end
   else if ( not (pat[j] in [LITCHAR, BOl, EOL, ANY, CCL, NCCL, CLOSURE])) then
   	raise Exception.Create( 'should never happen!' )
   else begin
   	case pat[j] of
      	LITCHAR:
         	if ( CmpChar( str[i], pat[j+1], csense ) ) then
            	advance := 1;
         BOL:
         	if ( i = 1 ) then
            	advance := 0;
         ANY:
         	if ( not AtEnd( str, i ) ) then       //i+1
            	advance := 1;
         EOL:
         	if ( AtEnd( str, i + 1 ) ) then
            	advance := 0;
         CCL:
         	if ( LocateChar( str[i], pat, j+1, csense )) then
            	advance := 1;
         NCCL:
				if ( not AtEnd( str, i + 1 )
            			and( not LocateChar( str[i], pat, j+1, csense) )) then
            	advance := 1;
      end;
   end;

   if ( advance >= 0 ) then begin
   	i := i + advance;
      result := true;
   end
   else
   	result := false;
end;

//------------------------------------------------------------------------------
// Look for a  match of patttern element pat[j] in a string, starting at offset.
//------------------------------------------------------------------------------

function MatchPat( const str : string; offset : integer;
						const pat : string; j : integer;
                  csense : boolean ) : integer;
var
	i, k : integer;
begin
	while( not AtEnd( pat, j ) ) do begin
   	if ( pat[j] = CLOSURE ) then begin
      	j := j + PatSize( pat, j );
         i := offset;
         while( (not AtEnd( str, i ))
         	and ( MatchOne( str, i, pat, j, csense ))) do begin
         	// nothing
         end;
         while( i >= offset ) do begin
         	k := MatchPat( str, i, pat, j + PatSize( pat, j ), csense);
            if ( k > 0 ) then
            	break;
            dec( i );
         end;

         offset := k;
         break;
      end
      else if ( not MatchOne( str, offset, pat, j, csense ) ) then begin
      	offset := 0;
         break;
      end
      else
      	j := j + PatSize( pat, j );
   end;
	result := offset;
end;

//------------------------------------------------------------------------------
// Look for a pattern in a string, returning the position of the pattern
// (or zero if not found, and the length.
//------------------------------------------------------------------------------

function Find( const str : string; start : integer;
							const pat : string;
							var len : integer;
								csense : boolean ) : integer;
var
	i, pos : integer;
begin
	i := start;
   pos := 0;
   result := 0;

   while ( not AtEnd( str, i )) do begin
      len := MatchPat( str, i, pat, 1, csense  );
   	if ( len <> 0 ) then begin
      	len := len - i;
      	result := i;
         exit;
      end;
      inc( i );
   end;
end;

//----------------------------------- eof --------------------------------------

end.

[Back to DELPHI SWAG index]  [Back to Main SWAG index]  [Original]