dunit RegExpr;
{
TRegExpr class library
Delphi Regular Expressions
Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
You may use this software in any kind of development,
including comercial, redistribute, and modify it freely,
under the following restrictions :
1. This software is provided as it is, without any kind of
warranty given. Use it at Your own risk.The author is not
responsible for any consequences of use of this software.
2. The origin of this software may not be mispresented, You
must not claim that You wrote the original software. If
You use this software in any kind of product, it would be
appreciated that there in a information box, or in the
documentation would be an acknowledgement like
Partial Copyright (c) 2004 Andrey V. Sorokin
3. You may not have any income from distributing this source
(or altered version of it) to other developers. When You
use this product in a comercial package, the source may
not be charged seperatly.
4. Altered versions must be plainly marked as such, and must
not be misrepresented as being the original software.
5. RegExp Studio application and all the visual components as
well as documentation is not part of the TRegExpr library
and is not free for usage.
}
interface
// ======== Determine compiler
{$IFDEF VER80} Sorry, TRegExpr is for 32-bits Delphi only. Delphi 1 is not supported (and whos really care today?!). {$ENDIF}
{$IFDEF VER90} {$DEFINE D2} {$ENDIF} // D2
{$IFDEF VER93} {$DEFINE D2} {$ENDIF} // CPPB 1
{$IFDEF VER100} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D3
{$IFDEF VER110} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // CPPB 3
{$IFDEF VER120} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D4
{$IFDEF VER130} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D5
{$IFDEF VER140} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D6
{$IFDEF VER150} {$DEFINE D7} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D7
// ======== Define base compiler options
{$BOOLEVAL OFF}
{$EXTENDEDSYNTAX ON}
{$LONGSTRINGS ON}
{$OPTIMIZATION ON}
{$IFDEF D6}
{$WARN SYMBOL_PLATFORM OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF D7}
{$WARN UNSAFE_CAST OFF} // Suppress .Net warnings
{$WARN UNSAFE_TYPE OFF} // Suppress .Net warnings
{$WARN UNSAFE_CODE OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF FPC}
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
{$ENDIF}
// ======== Define options for TRegExpr engine
{.$DEFINE UniCode} // Unicode support
{$DEFINE RegExpPCodeDump} // p-code dumping (see Dump method)
{$IFNDEF FPC} // the option is not supported in FreePascal
{$DEFINE reRealExceptionAddr} // exceptions will point to appropriate source line, not to Error procedure
{$ENDIF}
{$DEFINE ComplexBraces} // support braces in complex cases
{$IFNDEF UniCode} // the option applicable only for non-UniCode mode
{$DEFINE UseSetOfChar} // Significant optimization by using set of char
{$ENDIF}
{$IFDEF UseSetOfChar}
{$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
{$ENDIF}
// ======== Define Pascal-language options
// Define 'UseAsserts' option (do not edit this definitions).
// Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
// completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
{$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
{$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
// Define 'use subroutine parameters default values' option (do not edit this definition).
{$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
// Define 'OverMeth' options, to use method overloading (do not edit this definitions).
{$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
{$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
uses
Classes, // TStrings in Split method
SysUtils; // Exception
type
{$IFDEF UniCode}
PRegExprChar = PWideChar;
RegExprString = WideString;
REChar = WideChar;
{$ELSE}
PRegExprChar = PChar;
RegExprString = AnsiString; //###0.952 was string
REChar = Char;
{$ENDIF}
TREOp = REChar; // internal p-code type //###0.933
PREOp = ^TREOp;
TRENextOff = integer; // internal Next "pointer" (offset to current p-code) //###0.933
PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
TREBracesArg = integer; // type of {m,n} arguments
PREBracesArg = ^TREBracesArg;
const
REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units
RENextOffSz = SizeOf (TRENextOff) div SizeOf (REChar); // size of Next 'pointer' -"-
REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-
type
TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
of object;
const
EscChar = '\'; // 'Escape'-char ('\' in common r.e.) used for escaping metachars (\w, \d etc).
RegExprModifierI : boolean = False; // default value for ModifierI
RegExprModifierR : boolean = True; // default value for ModifierR
RegExprModifierS : boolean = True; // default value for ModifierS
RegExprModifierG : boolean = True; // default value for ModifierG
RegExprModifierM : boolean = False; // default value for ModifierM
RegExprModifierX : boolean = False; // default value for ModifierX
RegExprSpaceChars : RegExprString = // default value for SpaceChars
' '#$9#$A#$D#$C;
RegExprWordChars : RegExprString = // default value for WordChars
'0123456789' //###0.940
+ 'abcdefghijklmnopqrstuvwxyz'
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
RegExprLineSeparators : RegExprString =// default value for LineSeparators
#$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
RegExprLinePairedSeparator : RegExprString =// default value for LinePairedSeparator
#$d#$a;
{ if You need Unix-styled line separators (only \n), then use:
RegExprLineSeparators = #$a;
RegExprLinePairedSeparator = '';
}
const
NSUBEXP = 15; // max number of subexpression //###0.929
// Cannot be more than NSUBEXPMAX
// Be carefull - don't use values which overflow CLOSE opcode
// (in this case you'll get compiler erorr).
// Big NSUBEXP will cause more slow work and more stack required
NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
// Don't change it! It's defined by internal TRegExpr design.
MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
{$IFDEF ComplexBraces}
LoopStackMax = 10; // max depth of loops stack //###0.925
{$ENDIF}
TinySetLen = 3;
// if range includes more then TinySetLen chars, //###0.934
// then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
// !!! Attension ! If you change TinySetLen, you must
// change code marked as "//!!!TinySet"
type
{$IFDEF UseSetOfChar}
PSetOfREChar = ^TSetOfREChar;
TSetOfREChar = set of REChar;
{$ENDIF}
TRegExpr = class;
TRegExprReplaceFunction = function (ARegExpr : TRegExpr): string
of object;
TRegExpr = class
private
startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points
{$IFDEF ComplexBraces}
LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
LoopStackIdx : integer; // 0 - out of all loops
{$ENDIF}
// The "internal use only" fields to pass info from compile
// to execute that permits the execute phase to run lots faster on
// simple cases.
regstart : REChar; // char that must begin a match; '\0' if none obvious
reganch : REChar; // is the match anchored (at beginning-of-line only)?
regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
regmlen : integer; // length of regmust string
// Regstart and reganch permit very fast decisions on suitable starting points
// for a match, cutting down the work a lot. Regmust permits fast rejection
// of lines that cannot possibly match. The regmust tests are costly enough
// that regcomp() supplies a regmust only if the r.e. contains something
// potentially expensive (at present, the only such thing detected is * or +
// at the start of the r.e., which can involve a lot of backup). Regmlen is
// supplied because the test in regexec() needs it and regcomp() is computing
// it anyway.
{$IFDEF UseFirstCharSet} //###0.929
FirstCharSet : TSetOfREChar;
{$ENDIF}
// work variables for Exec's routins - save stack in recursion}
reginput : PRegExprChar; // String-input pointer.
fInputStart : PRegExprChar; // Pointer to first char of input string.
fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string
// work variables for compiler's routines
regparse : PRegExprChar; // Input-scan pointer.
regnpar : integer; // count.
regdummy : char;
regcode : PRegExprChar; // Code-emit pointer; @regdummy = don't.
regsize : integer; // Code size.
regexpbeg : PRegExprChar; // only for error handling. Contains
// pointer to beginning of r.e. while compiling
fExprIsCompiled : boolean; // true if r.e. successfully compiled
// programm is essentially a linear encoding
// of a nondeterministic finite-state machine (aka syntax charts or
// "railroad normal form" in parsing technology). Each node is an opcode
// plus a "next" pointer, possibly plus an operand. "Next" pointers of
// all nodes except BRANCH implement concatenation; a "next" pointer with
// a BRANCH on both ends of it is connecting two alternatives. (Here we
// have one of the subtle syntax dependencies: an individual BRANCH (as
// opposed to a collection of them) is never concatenated with anything
// because of operator precedence.) The operand of some types of node is
// a literal string; for others, it is a node leading into a sub-FSM. In
// particular, the operand of a BRANCH node is the first node of the branch.
// (NB this is *not* a tree structure: the tail of the branch connects
// to the thing following the set of BRANCHes.) The opcodes are:
programm : PRegExprChar; // Unwarranted chumminess with compiler.
fExpression : PRegExprChar; // source of compiled r.e.
fInputString : PRegExprChar; // input string
fLastError : integer; // see Error, LastError
fModifiers : integer; // modifiers
fCompModifiers : integer; // compiler's copy of modifiers
fProgModifiers : integer; // modifiers values from last programm compilation
fSpaceChars : RegExprString; //###0.927
fWordChars : RegExprString; //###0.929
fInvertCase : TRegExprInvertCaseFunction; //###0.927
fLineSeparators : RegExprString; //###0.941
fLinePairedSeparatorAssigned : boolean;
fLinePairedSeparatorHead,
fLinePairedSeparatorTail : REChar;
{$IFNDEF UniCode}
fLineSeparatorsSet : set of REChar;
{$ENDIF}
procedure InvalidateProgramm;
// Mark programm as have to be [re]compiled
function IsProgrammOk : boolean; //###0.941
// Check if we can use precompiled r.e. or
// [re]compile it if something changed
function GetExpression : RegExprString;
procedure SetExpression (const s : RegExprString);
function GetModifierStr : RegExprString;
class function ParseModifiersStr (const AModifiers : RegExprString;
var AModifiersInt : integer) : boolean; //###0.941 class function now
// Parse AModifiers string and return true and set AModifiersInt
// if it's in format 'ismxrg-ismxrg'.
procedure SetModifierStr (const AModifiers : RegExprString);
function GetModifier (AIndex : integer) : boolean;
procedure SetModifier (AIndex : integer; ASet : boolean);
procedure Error (AErrorID : integer); virtual; // error handler.
// Default handler raise exception ERegExpr with
// Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
// and CompilerErrorPos = value of property CompilerErrorPos.
{==================== Compiler section ===================}
function CompileRegExpr (exp : PRegExprChar) : boolean;
// compile a regular expression into internal code
procedure Tail (p : PRegExprChar; val : PRegExprChar);
// set the next-pointer at the end of a node chain
procedure OpTail (p : PRegExprChar; val : PRegExprChar);
// regoptail - regtail on operand of first argument; nop if operandless
function EmitNode (op : TREOp) : PRegExprChar;
// regnode - emit a node, return location
procedure EmitC (b : REChar);
// emit (if appropriate) a byte of code
procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
// insert an operator in front of already-emitted operand
// Means relocating the operand.
function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
// regular expression, i.e. main body or parenthesized thing
function ParseBranch (var flagp : integer) : PRegExprChar;
// one alternative of an | operator
function ParsePiece (var flagp : integer) : PRegExprChar;
// something followed by possible [*+?]
function ParseAtom (var flagp : integer) : PRegExprChar;
// the lowest level
function GetCompilerErrorPos : integer;
// current pos in r.e. - for error hanling
{$IFDEF UseFirstCharSet} //###0.929
procedure FillFirstCharSet (prog : PRegExprChar);
{$ENDIF}
{===================== Mathing section ===================}
function regrepeat (p : PRegExprChar; AMax : integer) : integer;
// repeatedly match something simple, report how many
function regnext (p : PRegExprChar) : PRegExprChar;
// dig the "next" pointer out of a node
function MatchPrim (prog : PRegExprChar) : boolean;
// recursively matching routine
function ExecPrim (AOffset: integer) : boolean;
// Exec for stored InputString
{$IFDEF RegExpPCodeDump}
function DumpOp (op : REChar) : RegExprString;
{$ENDIF}
function GetSubExprMatchCount : integer;
function GetMatchPos (Idx : integer) : integer;
function GetMatchLen (Idx : integer) : integer;
function GetMatch (Idx : integer) : RegExprString;
function GetInputString : RegExprString;
procedure SetInputString (const AInputString : RegExprString);
{$IFNDEF UseSetOfChar}
function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
{$ENDIF}
procedure SetLineSeparators (const AStr : RegExprString);
procedure SetLinePairedSeparator (const AStr : RegExprString);
function GetLinePairedSeparator : RegExprString;
public
constructor Create;
destructor Destroy; override;
class function VersionMajor : integer; //###0.944
class function VersionMinor : integer; //###0.944
property Expression : RegExprString read GetExpression write SetExpression;
// Regular expression.
// For optimization, TRegExpr will automatically compiles it into 'P-code'
// (You can see it with help of Dump method) and stores in internal
// structures. Real [re]compilation occures only when it really needed -
// while calling Exec[Next], Substitute, Dump, etc
// and only if Expression or other P-code affected properties was changed
// after last [re]compilation.
// If any errors while [re]compilation occures, Error method is called
// (by default Error raises exception - see below)
property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
// Set/get default values of r.e.syntax modifiers. Modifiers in
// r.e. (?ismx-ismx) will replace this default values.
// If you try to set unsupported modifier, Error will be called
// (by defaul Error raises exception ERegExpr).
property ModifierI : boolean index 1 read GetModifier write SetModifier;
// Modifier /i - caseinsensitive, initialized from RegExprModifierI
property ModifierR : boolean index 2 read GetModifier write SetModifier;
// Modifier /r - use r.e.syntax extended for russian,
// (was property ExtSyntaxEnabled in previous versions)
// If true, then ?
阅读(5673) | 评论(0) | 转发(0) |