Chinaunix首页 | 论坛 | 博客
  • 博客访问: 400300
  • 博文数量: 38
  • 博客积分: 1490
  • 博客等级: 上尉
  • 技术积分: 406
  • 用 户 组: 普通用户
  • 注册时间: 2006-01-08 00:52
文章分类

全部博文(38)

文章存档

2014年(1)

2013年(1)

2008年(6)

2007年(7)

2006年(23)

我的朋友

分类:

2006-04-13 11:26:20

dunit RegExpr;
{
     TRegExpr class library
     Delphi Regular Expressions
 Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
 You may use this software in any kind of development,
 including comercial, redistribute, and modify it freely,
 under the following restrictions :
 1. This software is provided as it is, without any kind of
    warranty given. Use it at Your own risk.The author is not
    responsible for any consequences of use of this software.
 2. The origin of this software may not be mispresented, You
    must not claim that You wrote the original software. If
    You use this software in any kind of product, it would be
    appreciated that there in a information box, or in the
    documentation would be an acknowledgement like
     Partial Copyright (c) 2004 Andrey V. Sorokin
                               
                               
 3. You may not have any income from distributing this source
    (or altered version of it) to other developers. When You
    use this product in a comercial package, the source may
    not be charged seperatly.
 4. Altered versions must be plainly marked as such, and must
    not be misrepresented as being the original software.
 5. RegExp Studio application and all the visual components as
    well as documentation is not part of the TRegExpr library
    and is not free for usage.
                                   
                                   
                                   
}
interface
// ======== Determine compiler
{$IFDEF VER80} Sorry, TRegExpr is for 32-bits Delphi only. Delphi 1 is not supported (and whos really care today?!). {$ENDIF}
{$IFDEF VER90} {$DEFINE D2} {$ENDIF} // D2
{$IFDEF VER93} {$DEFINE D2} {$ENDIF} // CPPB 1
{$IFDEF VER100} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D3
{$IFDEF VER110} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // CPPB 3
{$IFDEF VER120} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D4
{$IFDEF VER130} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D5
{$IFDEF VER140} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D6
{$IFDEF VER150} {$DEFINE D7} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D7
// ======== Define base compiler options
{$BOOLEVAL OFF}
{$EXTENDEDSYNTAX ON}
{$LONGSTRINGS ON}
{$OPTIMIZATION ON}
{$IFDEF D6}
  {$WARN SYMBOL_PLATFORM OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF D7}
  {$WARN UNSAFE_CAST OFF} // Suppress .Net warnings
  {$WARN UNSAFE_TYPE OFF} // Suppress .Net warnings
  {$WARN UNSAFE_CODE OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF FPC}
 {$MODE DELPHI} // Delphi-compatible mode in FreePascal
{$ENDIF}
// ======== Define options for TRegExpr engine
{.$DEFINE UniCode} // Unicode support
{$DEFINE RegExpPCodeDump} // p-code dumping (see Dump method)
{$IFNDEF FPC} // the option is not supported in FreePascal
 {$DEFINE reRealExceptionAddr} // exceptions will point to appropriate source line, not to Error procedure
{$ENDIF}
{$DEFINE ComplexBraces} // support braces in complex cases
{$IFNDEF UniCode} // the option applicable only for non-UniCode mode
 {$DEFINE UseSetOfChar} // Significant optimization by using set of char
{$ENDIF}
{$IFDEF UseSetOfChar}
 {$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
{$ENDIF}
// ======== Define Pascal-language options
// Define 'UseAsserts' option (do not edit this definitions).
// Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
// completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
{$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
{$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
// Define 'use subroutine parameters default values' option (do not edit this definition).
{$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
// Define 'OverMeth' options, to use method overloading (do not edit this definitions).
{$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
{$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
uses
 Classes,  // TStrings in Split method
 SysUtils; // Exception
type
 {$IFDEF UniCode}
 PRegExprChar = PWideChar;
 RegExprString = WideString;
 REChar = WideChar;
 {$ELSE}
 PRegExprChar = PChar;
 RegExprString = AnsiString; //###0.952 was string
 REChar = Char;
 {$ENDIF}
 TREOp = REChar; // internal p-code type //###0.933
 PREOp = ^TREOp;
 TRENextOff = integer; // internal Next "pointer" (offset to current p-code) //###0.933
 PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
 TREBracesArg = integer; // type of {m,n} arguments
 PREBracesArg = ^TREBracesArg;
const
 REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units
 RENextOffSz = SizeOf (TRENextOff) div SizeOf (REChar); // size of Next 'pointer' -"-
 REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-
type
 TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
                               of object;
const
  EscChar = '\'; // 'Escape'-char ('\' in common r.e.) used for escaping metachars (\w, \d etc).
  RegExprModifierI : boolean = False;    // default value for ModifierI
  RegExprModifierR : boolean = True;     // default value for ModifierR
  RegExprModifierS : boolean = True;     // default value for ModifierS
  RegExprModifierG : boolean = True;     // default value for ModifierG
  RegExprModifierM : boolean = False;    // default value for ModifierM
  RegExprModifierX : boolean = False;    // default value for ModifierX
  RegExprSpaceChars : RegExprString =    // default value for SpaceChars
  ' '#$9#$A#$D#$C;
  RegExprWordChars : RegExprString =     // default value for WordChars
    '0123456789' //###0.940
  + 'abcdefghijklmnopqrstuvwxyz'
  + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
  RegExprLineSeparators : RegExprString =// default value for LineSeparators
   #$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
  RegExprLinePairedSeparator : RegExprString =// default value for LinePairedSeparator
   #$d#$a;
  { if You need Unix-styled line separators (only \n), then use:
  RegExprLineSeparators = #$a;
  RegExprLinePairedSeparator = '';
  }

const
 NSUBEXP = 15; // max number of subexpression //###0.929
 // Cannot be more than NSUBEXPMAX
 // Be carefull - don't use values which overflow CLOSE opcode
 // (in this case you'll get compiler erorr).
 // Big NSUBEXP will cause more slow work and more stack required
 NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
 // Don't change it! It's defined by internal TRegExpr design.
 MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
 {$IFDEF ComplexBraces}
 LoopStackMax = 10; // max depth of loops stack //###0.925
 {$ENDIF}
 TinySetLen = 3;
 // if range includes more then TinySetLen chars, //###0.934
 // then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
 // !!! Attension ! If you change TinySetLen, you must
 // change code marked as "//!!!TinySet"

type
{$IFDEF UseSetOfChar}
 PSetOfREChar = ^TSetOfREChar;
 TSetOfREChar = set of REChar;
{$ENDIF}
 TRegExpr = class;
 TRegExprReplaceFunction = function (ARegExpr : TRegExpr): string
                               of object;
 TRegExpr = class
   private
    startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
    endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points
    {$IFDEF ComplexBraces}
    LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
    LoopStackIdx : integer; // 0 - out of all loops
    {$ENDIF}
    // The "internal use only" fields to pass info from compile
    // to execute that permits the execute phase to run lots faster on
    // simple cases.
    regstart : REChar; // char that must begin a match; '\0' if none obvious
    reganch : REChar; // is the match anchored (at beginning-of-line only)?
    regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
    regmlen : integer; // length of regmust string
    // Regstart and reganch permit very fast decisions on suitable starting points
    // for a match, cutting down the work a lot.  Regmust permits fast rejection
    // of lines that cannot possibly match.  The regmust tests are costly enough
    // that regcomp() supplies a regmust only if the r.e. contains something
    // potentially expensive (at present, the only such thing detected is * or +
    // at the start of the r.e., which can involve a lot of backup).  Regmlen is
    // supplied because the test in regexec() needs it and regcomp() is computing
    // it anyway.
    {$IFDEF UseFirstCharSet} //###0.929
    FirstCharSet : TSetOfREChar;
    {$ENDIF}
    // work variables for Exec's routins - save stack in recursion}
    reginput : PRegExprChar; // String-input pointer.
    fInputStart : PRegExprChar; // Pointer to first char of input string.
    fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string
    // work variables for compiler's routines
    regparse : PRegExprChar;  // Input-scan pointer.
    regnpar : integer; // count.
    regdummy : char;
    regcode : PRegExprChar;   // Code-emit pointer; @regdummy = don't.
    regsize : integer; // Code size.
    regexpbeg : PRegExprChar; // only for error handling. Contains
    // pointer to beginning of r.e. while compiling
    fExprIsCompiled : boolean; // true if r.e. successfully compiled
    // programm is essentially a linear encoding
    // of a nondeterministic finite-state machine (aka syntax charts or
    // "railroad normal form" in parsing technology).  Each node is an opcode
    // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
    // all nodes except BRANCH implement concatenation; a "next" pointer with
    // a BRANCH on both ends of it is connecting two alternatives.  (Here we
    // have one of the subtle syntax dependencies:  an individual BRANCH (as
    // opposed to a collection of them) is never concatenated with anything
    // because of operator precedence.)  The operand of some types of node is
    // a literal string; for others, it is a node leading into a sub-FSM.  In
    // particular, the operand of a BRANCH node is the first node of the branch.
    // (NB this is *not* a tree structure:  the tail of the branch connects
    // to the thing following the set of BRANCHes.)  The opcodes are:
    programm : PRegExprChar; // Unwarranted chumminess with compiler.
    fExpression : PRegExprChar; // source of compiled r.e.
    fInputString : PRegExprChar; // input string
    fLastError : integer; // see Error, LastError
    fModifiers : integer; // modifiers
    fCompModifiers : integer; // compiler's copy of modifiers
    fProgModifiers : integer; // modifiers values from last programm compilation
    fSpaceChars : RegExprString; //###0.927
    fWordChars : RegExprString; //###0.929
    fInvertCase : TRegExprInvertCaseFunction; //###0.927
    fLineSeparators : RegExprString; //###0.941
    fLinePairedSeparatorAssigned : boolean;
    fLinePairedSeparatorHead,
    fLinePairedSeparatorTail : REChar;
    {$IFNDEF UniCode}
    fLineSeparatorsSet : set of REChar;
    {$ENDIF}
    procedure InvalidateProgramm;
    // Mark programm as have to be [re]compiled
    function IsProgrammOk : boolean; //###0.941
    // Check if we can use precompiled r.e. or
    // [re]compile it if something changed
    function GetExpression : RegExprString;
    procedure SetExpression (const s : RegExprString);
    function GetModifierStr : RegExprString;
    class function ParseModifiersStr (const AModifiers : RegExprString;
      var AModifiersInt : integer) : boolean; //###0.941 class function now
    // Parse AModifiers string and return true and set AModifiersInt
    // if it's in format 'ismxrg-ismxrg'.
    procedure SetModifierStr (const AModifiers : RegExprString);
    function GetModifier (AIndex : integer) : boolean;
    procedure SetModifier (AIndex : integer; ASet : boolean);
    procedure Error (AErrorID : integer); virtual; // error handler.
    // Default handler raise exception ERegExpr with
    // Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
    // and CompilerErrorPos = value of property CompilerErrorPos.

    {==================== Compiler section ===================}
    function CompileRegExpr (exp : PRegExprChar) : boolean;
    // compile a regular expression into internal code
    procedure Tail (p : PRegExprChar; val : PRegExprChar);
    // set the next-pointer at the end of a node chain
    procedure OpTail (p : PRegExprChar; val : PRegExprChar);
    // regoptail - regtail on operand of first argument; nop if operandless
    function EmitNode (op : TREOp) : PRegExprChar;
    // regnode - emit a node, return location
    procedure EmitC (b : REChar);
    // emit (if appropriate) a byte of code
    procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
    // insert an operator in front of already-emitted operand
    // Means relocating the operand.
    function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
    // regular expression, i.e. main body or parenthesized thing
    function ParseBranch (var flagp : integer) : PRegExprChar;
    // one alternative of an | operator
    function ParsePiece (var flagp : integer) : PRegExprChar;
    // something followed by possible [*+?]
    function ParseAtom (var flagp : integer) : PRegExprChar;
    // the lowest level
    function GetCompilerErrorPos : integer;
    // current pos in r.e. - for error hanling
    {$IFDEF UseFirstCharSet} //###0.929
    procedure FillFirstCharSet (prog : PRegExprChar);
    {$ENDIF}
    {===================== Mathing section ===================}
    function regrepeat (p : PRegExprChar; AMax : integer) : integer;
    // repeatedly match something simple, report how many
    function regnext (p : PRegExprChar) : PRegExprChar;
    // dig the "next" pointer out of a node
    function MatchPrim (prog : PRegExprChar) : boolean;
    // recursively matching routine
    function ExecPrim (AOffset: integer) : boolean;
    // Exec for stored InputString
    {$IFDEF RegExpPCodeDump}
    function DumpOp (op : REChar) : RegExprString;
    {$ENDIF}
    function GetSubExprMatchCount : integer;
    function GetMatchPos (Idx : integer) : integer;
    function GetMatchLen (Idx : integer) : integer;
    function GetMatch (Idx : integer) : RegExprString;
    function GetInputString : RegExprString;
    procedure SetInputString (const AInputString : RegExprString);
    {$IFNDEF UseSetOfChar}
    function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
    {$ENDIF}
    procedure SetLineSeparators (const AStr : RegExprString);
    procedure SetLinePairedSeparator (const AStr : RegExprString);
    function GetLinePairedSeparator : RegExprString;
   public
    constructor Create;
    destructor Destroy; override;
    class function VersionMajor : integer; //###0.944
    class function VersionMinor : integer; //###0.944
    property Expression : RegExprString read GetExpression write SetExpression;
    // Regular expression.
    // For optimization, TRegExpr will automatically compiles it into 'P-code'
    // (You can see it with help of Dump method) and stores in internal
    // structures. Real [re]compilation occures only when it really needed -
    // while calling Exec[Next], Substitute, Dump, etc
    // and only if Expression or other P-code affected properties was changed
    // after last [re]compilation.
    // If any errors while [re]compilation occures, Error method is called
    // (by default Error raises exception - see below)
    property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
    // Set/get default values of r.e.syntax modifiers. Modifiers in
    // r.e. (?ismx-ismx) will replace this default values.
    // If you try to set unsupported modifier, Error will be called
    // (by defaul Error raises exception ERegExpr).
    property ModifierI : boolean index 1 read GetModifier write SetModifier;
    // Modifier /i - caseinsensitive, initialized from RegExprModifierI
    property ModifierR : boolean index 2 read GetModifier write SetModifier;
    // Modifier /r - use r.e.syntax extended for russian,
    // (was property ExtSyntaxEnabled in previous versions)
    // If true, then ?
阅读(5629) | 评论(0) | 转发(0) |
0

上一篇:DES文件加密

下一篇:cron配置

给主人留下些什么吧!~~