// This is a part of the Active Template Library. // Copyright (C) Microsoft Corporation // All rights reserved. // // This source code is only intended as a supplement to the // Active Template Library Reference and related // electronic documentation provided with the library. // See these sources for detailed information regarding the // Active Template Library product. #ifndef __ATLRX_H__ #define __ATLRX_H__ #pragma once #include #include #include #ifndef ATL_REGEXP_MIN_STACK #define ATL_REGEXP_MIN_STACK 256 #endif /* Regular Expression Grammar R - top level grammar rule RE - regular expression AltE - Alternative expression E - expression SE - simple expression R -> RE '^'RE (matches begining of string) RE -> AltE RE AltE AltE -> E E '|' AltE E -> SE (RepeatOp '?'?)? SE -> Arg Group CharClass '\'Abbrev (see below) '\'EscapedChar (any character including reserved symbols) '\'Digit+ (Arg back reference) '!' (not) '.' (any char) '$' (end of input) Symbol (any non-reserved character) Arg -> '{'RE'}' Group -> '('RE')' CharClass -> '[' '^'? CharSet ']' CharSet -> CharItem+ CharItem -> Char('-'Char)? RepeatOp -> '*' '+' '?' Abbrev -> Abbreviation defined in CAtlRECharTraits Abbrev Expansion Meaning a ([a-zA-Z0-9]) alpha numeric b ([ \\t]) white space (blank) c ([a-zA-Z]) alpha d ([0-9]) digit h ([0-9a-fA-F]) hex digit n (\r|(\r?\n)) newline q (\"[^\"]*\")|(\'[^\']*\') quoted string w ([a-zA-Z]+) simple word z ([0-9]+) integer */ #pragma pack(push,_ATL_PACKING) namespace ATL { //Convertion utility classes used to convert char* to RECHAR. //Used by rx debugging printing. template class CAToREChar { public: CAToREChar(const char* psz) throw() : m_psz(psz) { } operator const RECHARTYPE*() const throw() { return m_psz; } const char* m_psz; }; template<> class CAToREChar { public: CAToREChar(const char* psz) throw() : m_a2w(psz) { } operator const wchar_t*() const throw() { return (wchar_t*)m_a2w; } private: CA2W m_a2w; }; class CAtlRECharTraitsA { public: typedef char RECHARTYPE; static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw() { #ifndef ATL_NO_CHECK_BIT_FIELD ATLASSERT(UseBitFieldForRange()); #endif return static_cast(static_cast(*sz)); } static RECHARTYPE *Next(const RECHARTYPE *sz) throw() { return (RECHARTYPE *) (sz+1); } static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw() { return strncmp(szLeft, szRight, nCount); } static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw() { return _strnicmp(szLeft, szRight, nCount); } _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsA::Strlwr must be passed a buffer size.") static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw() { #pragma warning (push) #pragma warning(disable : 4996) return _strlwr(sz); #pragma warning (pop) } static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw() { Checked::strlwr_s(sz, nSize); return sz; } static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw() { return strtol(sz, szEnd, nBase); } static int Isdigit(RECHARTYPE ch) throw() { return isdigit(static_cast(ch)); } static const RECHARTYPE** GetAbbrevs() { static const RECHARTYPE *s_szAbbrevs[] = { "a([a-zA-Z0-9])", // alpha numeric "b([ \\t])", // white space (blank) "c([a-zA-Z])", // alpha "d([0-9])", // digit "h([0-9a-fA-F])", // hex digit "n(\r|(\r?\n))", // newline "q(\"[^\"]*\")|(\'[^\']*\')", // quoted string "w([a-zA-Z]+)", // simple word "z([0-9]+)", // integer NULL }; return s_szAbbrevs; } static BOOL UseBitFieldForRange() throw() { return TRUE; } static int ByteLen(const RECHARTYPE *sz) throw() { return int(strlen(sz)); } }; class CAtlRECharTraitsW { public: typedef WCHAR RECHARTYPE; static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw() { #ifndef ATL_NO_CHECK_BIT_FIELD ATLASSERT(UseBitFieldForRange()); #endif return static_cast(*sz); } static RECHARTYPE *Next(const RECHARTYPE *sz) throw() { return (RECHARTYPE *) (sz+1); } static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw() { return wcsncmp(szLeft, szRight, nCount); } static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw() { return _wcsnicmp(szLeft, szRight, nCount); } _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsW::Strlwr must be passed a buffer size.") static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw() { #pragma warning (push) #pragma warning(disable : 4996) return _wcslwr(sz); #pragma warning (pop) } static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw() { Checked::wcslwr_s(sz, nSize); return sz; } static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw() { return wcstol(sz, szEnd, nBase); } static int Isdigit(RECHARTYPE ch) throw() { return iswdigit(ch); } static const RECHARTYPE** GetAbbrevs() { static const RECHARTYPE *s_szAbbrevs[] = { L"a([a-zA-Z0-9])", // alpha numeric L"b([ \\t])", // white space (blank) L"c([a-zA-Z])", // alpha L"d([0-9])", // digit L"h([0-9a-fA-F])", // hex digit L"n(\r|(\r?\n))", // newline L"q(\"[^\"]*\")|(\'[^\']*\')", // quoted string L"w([a-zA-Z]+)", // simple word L"z([0-9]+)", // integer NULL }; return s_szAbbrevs; } static BOOL UseBitFieldForRange() throw() { return FALSE; } static int ByteLen(const RECHARTYPE *sz) throw() { return int(wcslen(sz)*sizeof(WCHAR)); } }; class CAtlRECharTraitsMB { public: typedef unsigned char RECHARTYPE; static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw() { #ifndef ATL_NO_CHECK_BIT_FIELD ATLASSERT(UseBitFieldForRange()); #endif return static_cast(*sz); } static RECHARTYPE *Next(const RECHARTYPE *sz) throw() { return _mbsinc(sz); } static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw() { return _mbsncmp(szLeft, szRight, nCount); } static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw() { return _mbsnicmp(szLeft, szRight, nCount); } _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsMB::Strlwr must be passed a buffer size.") static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw() { #pragma warning (push) #pragma warning(disable : 4996) return _mbslwr(sz); #pragma warning (pop) } static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw() { Checked::mbslwr_s(sz, nSize); return sz; } static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw() { return strtol((const char *) sz, (char **) szEnd, nBase); } static int Isdigit(RECHARTYPE ch) throw() { return _ismbcdigit((unsigned int) ch); } static const RECHARTYPE** GetAbbrevs() { return reinterpret_cast(CAtlRECharTraitsA::GetAbbrevs()); } static BOOL UseBitFieldForRange() throw() { return FALSE; } static int ByteLen(const RECHARTYPE *sz) throw() { return (int)strlen((const char *) sz); } }; #ifndef _UNICODE typedef CAtlRECharTraitsA CAtlRECharTraits; #else // _UNICODE typedef CAtlRECharTraitsW CAtlRECharTraits; #endif // !_UNICODE // Note: If you want to use CAtlRECharTraitsMB you must pass it in // as a template argument template class CAtlRegExp; // forward declaration template class CAtlREMatchContext { public: friend CAtlRegExp; typedef typename CharTraits::RECHARTYPE RECHAR; struct MatchGroup { const RECHAR *szStart; const RECHAR *szEnd; }; UINT m_uNumGroups; MatchGroup m_Match; void GetMatch(UINT nIndex, const RECHAR **szStart, const RECHAR **szEnd) { ATLENSURE(szStart != NULL); ATLENSURE(szEnd != NULL); ATLENSURE(nIndex >=0 && nIndex < m_uNumGroups); *szStart = m_Matches[nIndex].szStart; *szEnd = m_Matches[nIndex].szEnd; } void GetMatch(UINT nIndex, MatchGroup *pGroup) { ATLENSURE(pGroup != NULL); ATLENSURE(nIndex >=0&&(static_cast(nIndex))< m_uNumGroups); pGroup->szStart = m_Matches[nIndex].szStart; pGroup->szEnd = m_Matches[nIndex].szEnd; } protected: CAutoVectorPtr m_Mem; CAutoVectorPtr m_Matches; CAtlArray m_stack; size_t m_nTos; public: CAtlREMatchContext(size_t nInitStackSize=ATL_REGEXP_MIN_STACK) { m_uNumGroups = 0; m_nTos = 0; m_stack.SetCount(nInitStackSize); m_Match.szStart = NULL; m_Match.szEnd = NULL; } protected: BOOL Initialize(UINT uRequiredMem, UINT uNumGroups) throw() { m_nTos = 0; m_uNumGroups = 0; m_Matches.Free(); if (!m_Matches.Allocate(uNumGroups)) return FALSE; m_uNumGroups = uNumGroups; m_Mem.Free(); if (!m_Mem.Allocate(uRequiredMem)) return FALSE; memset(m_Mem.m_p, 0x00, uRequiredMem*sizeof(void *)); memset(m_Matches, 0x00, m_uNumGroups * sizeof(MatchGroup)); return TRUE; } BOOL Push(void *p) { m_nTos++; if (m_stack.GetCount() <= (UINT) m_nTos) { if (!m_stack.SetCount((m_nTos+1)*2)) { m_nTos--; return FALSE; } } m_stack[m_nTos] = p; return TRUE; } BOOL Push(size_t n) { return Push((void *) n); } void *Pop() throw() { if (m_nTos==0) { // stack underflow // this should never happen at match time. // (the parsing succeeded when it shouldn't have) ATLASSERT(FALSE); return NULL; } void *p = m_stack[m_nTos]; m_nTos--; return p; } }; enum REParseError { REPARSE_ERROR_OK = 0, // No error occurred REPARSE_ERROR_OUTOFMEMORY, // Out of memory REPARSE_ERROR_BRACE_EXPECTED, // A closing brace was expected REPARSE_ERROR_PAREN_EXPECTED, // A closing parenthesis was expected REPARSE_ERROR_BRACKET_EXPECTED, // A closing bracket was expected REPARSE_ERROR_UNEXPECTED, // An unspecified fatal error occurred REPARSE_ERROR_EMPTY_RANGE, // A range expression was empty REPARSE_ERROR_INVALID_GROUP, // A backreference was made to a group // that did not exist REPARSE_ERROR_INVALID_RANGE, // An invalid range was specified REPARSE_ERROR_EMPTY_REPEATOP, // A possibly empty * or + was detected REPARSE_ERROR_INVALID_INPUT, // The input string was invalid }; template class CAtlRegExp { public: CAtlRegExp() throw() { m_uNumGroups = 0; m_uRequiredMem = 0; m_bCaseSensitive = TRUE; m_LastError = REPARSE_ERROR_OK; } typedef typename CharTraits::RECHARTYPE RECHAR; // CAtlRegExp::Parse // Parses the regular expression // returns REPARSE_ERROR_OK if successful, an REParseError otherwise REParseError Parse(const RECHAR *szRE, BOOL bCaseSensitive=TRUE) { ATLASSERT(szRE); if (!szRE) return REPARSE_ERROR_INVALID_INPUT; Reset(); m_bCaseSensitive = bCaseSensitive; const RECHAR *szInput = szRE; if (!bCaseSensitive) { // copy the string int nSize = CharTraits::ByteLen(szRE)+sizeof(RECHAR); szInput = (const RECHAR *) malloc(nSize); if (!szInput) return REPARSE_ERROR_OUTOFMEMORY; Checked::memcpy_s((char *) szInput, nSize, szRE, nSize); CharTraits::Strlwr(const_cast(szInput), nSize/sizeof(RECHAR)); } const RECHAR *sz = szInput; int nCall = AddInstruction(RE_CALL); if (nCall < 0) return REPARSE_ERROR_OUTOFMEMORY; if (*sz == '^') { if (AddInstruction(RE_FAIL) < 0) return REPARSE_ERROR_OUTOFMEMORY; sz++; } else { if (AddInstruction(RE_ADVANCE) < 0) return REPARSE_ERROR_OUTOFMEMORY; } bool bEmpty = true; ParseRE(&sz, bEmpty); if (!GetLastParseError()) { GetInstruction(nCall).call.nTarget = 2; if (AddInstruction(RE_MATCH) < 0) return REPARSE_ERROR_OUTOFMEMORY; } if (szInput != szRE) free((void *) szInput); return GetLastParseError(); } BOOL Match(const RECHAR *szIn, CAtlREMatchContext *pContext, const RECHAR **ppszEnd=NULL) { ATLASSERT(szIn); ATLASSERT(pContext); if (!szIn || !pContext) return FALSE; if (ppszEnd) *ppszEnd = NULL; const RECHAR *szInput = szIn; if (!m_bCaseSensitive) { int nSize = CharTraits::ByteLen(szIn)+sizeof(RECHAR); szInput = (const RECHAR *) malloc(nSize); if (!szInput) return FALSE; Checked::memcpy_s((char *) szInput, nSize, szIn, nSize); CharTraits::Strlwr(const_cast(szInput), nSize/sizeof(RECHAR)); } if (!pContext->Initialize(m_uRequiredMem, m_uNumGroups)) { if (szInput != szIn) free((void *) szInput); return FALSE; } size_t ip = 0; const RECHAR *sz = szInput; const RECHAR *szCurrInput = szInput; #pragma warning(push) #pragma warning(disable:4127) // conditional expression is constant while (1) { #ifdef ATLRX_DEBUG OnDebugEvent(ip, szInput, sz, pContext); #endif if (ip == 0) pContext->m_Match.szStart = sz; switch (GetInstruction(ip).type) { case RE_NOP: ip++; break; case RE_SYMBOL: if (GetInstruction(ip).symbol.nSymbol == static_cast(*sz)) { sz = CharTraits::Next(sz); ip++; } else { ip = (size_t) pContext->Pop(); } break; case RE_ANY: if (*sz) { sz = CharTraits::Next(sz); ip++; } else { ip = (size_t) pContext->Pop(); } break; case RE_GROUP_START: pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart = sz; ip++; break; case RE_GROUP_END: pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd = sz; ip++; break; case RE_PUSH_CHARPOS: pContext->Push((void *) sz); ip++; break; case RE_POP_CHARPOS: sz = (RECHAR *) pContext->Pop(); ip++; break; case RE_CALL: pContext->Push(ip+1); ip = GetInstruction(ip).call.nTarget; break; case RE_JMP: ip = GetInstruction(ip).jmp.nTarget; break; case RE_RETURN: ip = (size_t) pContext->Pop(); break; case RE_PUSH_MEMORY: pContext->Push((void *) (pContext->m_Mem[GetInstruction(ip).memory.nIndex])); ip++; break; case RE_POP_MEMORY: pContext->m_Mem[GetInstruction(ip).memory.nIndex] = pContext->Pop(); ip++; break; case RE_STORE_CHARPOS: pContext->m_Mem[GetInstruction(ip).memory.nIndex] = (void *) sz; ip++; break; case RE_GET_CHARPOS: sz = (RECHAR *) pContext->m_Mem[GetInstruction(ip).memory.nIndex]; ip++; break; case RE_STORE_STACKPOS: pContext->m_Mem[GetInstruction(ip).memory.nIndex] = (void *) pContext->m_nTos; ip++; break; case RE_GET_STACKPOS: pContext->m_nTos = (size_t) pContext->m_Mem[GetInstruction(ip).memory.nIndex]; ip++; break; case RE_RET_NOMATCH: if (sz == (RECHAR *) pContext->m_Mem[GetInstruction(ip).memory.nIndex]) { // do a return ip = (size_t) pContext->Pop(); } else ip++; break; case RE_ADVANCE: sz = CharTraits::Next(szCurrInput); szCurrInput = sz; if (*sz == '\0') goto Error; ip = 0; pContext->m_nTos = 0; break; case RE_FAIL: goto Error; case RE_RANGE: { if (*sz == '\0') { ip = (size_t) pContext->Pop(); break; } RECHAR *pBits = reinterpret_cast((&m_Instructions[ip]+1)); size_t u = CharTraits::GetBitFieldForRangeArrayIndex(sz); if (pBits[u >> 3] & 1 << (u & 0x7)) { ip += InstructionsPerRangeBitField(); ip++; sz = CharTraits::Next(sz); } else { ip = (size_t) pContext->Pop(); } } break; case RE_NOTRANGE: { if (*sz == '\0') { ip = (size_t) pContext->Pop(); break; } RECHAR *pBits = reinterpret_cast((&m_Instructions[ip]+1)); size_t u = static_cast(* ((RECHAR *) sz)); if (pBits[u >> 3] & 1 << (u & 0x7)) { ip = (size_t) pContext->Pop(); } else { ip += InstructionsPerRangeBitField(); ip++; sz = CharTraits::Next(sz); } } break; case RE_RANGE_EX: { if (*sz == '\0') { ip = (size_t) pContext->Pop(); break; } BOOL bMatch = FALSE; size_t inEnd = GetInstruction(ip).range.nTarget; ip++; while (ip < inEnd) { if (static_cast(*sz) >= GetInstruction(ip).memory.nIndex && static_cast(*sz) <= GetInstruction(ip+1).memory.nIndex) { // if we match, we jump to the end sz = CharTraits::Next(sz); ip = inEnd; bMatch = TRUE; } else { ip += 2; } } if (!bMatch) { ip = (size_t) pContext->Pop(); } } break; case RE_NOTRANGE_EX: { if (*sz == '\0') { ip = (size_t) pContext->Pop(); break; } BOOL bMatch = TRUE; size_t inEnd = GetInstruction(ip).range.nTarget; ip++; while (ip < inEnd) { if (static_cast(*sz) >= GetInstruction(ip).memory.nIndex && static_cast(*sz) <= GetInstruction(ip+1).memory.nIndex) { ip = (size_t) pContext->Pop(); bMatch = FALSE; break; } else { // if we match, we jump to the end ip += 2; } } if (bMatch) sz = CharTraits::Next(sz); } break; case RE_PREVIOUS: { BOOL bMatch = FALSE; if (m_bCaseSensitive) { bMatch = !CharTraits::Strncmp(sz, pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart, pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart); } else { bMatch = !CharTraits::Strnicmp(sz, pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart, pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart); } if (bMatch) { sz += pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart; ip++; break; } ip = (size_t) pContext->Pop(); } break; case RE_MATCH: pContext->m_Match.szEnd = sz; if (!m_bCaseSensitive) FixupMatchContext(pContext, szIn, szInput); if (ppszEnd) *ppszEnd = szIn + (sz - szInput); if (szInput != szIn) free((void *) szInput); return TRUE; break; case RE_PUSH_GROUP: pContext->Push((void *) pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart); pContext->Push((void *) pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd); ip++; break; case RE_POP_GROUP: pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd = (const RECHAR *) pContext->Pop(); pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart = (const RECHAR *) pContext->Pop(); ip++; break; default: ATLASSERT(FALSE); break; } } #pragma warning(pop) // 4127 ATLASSERT(FALSE); Error: pContext->m_Match.szEnd = sz; if (!m_bCaseSensitive) FixupMatchContext(pContext, szIn, szInput); if (ppszEnd) *ppszEnd = szIn + (sz - szInput); if (szInput != szIn) free((void *) szInput); return FALSE; } protected: REParseError m_LastError; REParseError GetLastParseError() throw() { return m_LastError; } void SetLastParseError(REParseError Error) throw() { m_LastError = Error; } // CAtlRegExp::Reset // Removes all instructions to allow reparsing into the same instance void Reset() throw() { m_Instructions.RemoveAll(); m_uRequiredMem = 0; m_bCaseSensitive = TRUE; m_uNumGroups = 0; SetLastParseError(REPARSE_ERROR_OK); } enum REInstructionType { RE_NOP, RE_GROUP_START, RE_GROUP_END, RE_SYMBOL, RE_ANY, RE_RANGE, RE_NOTRANGE, RE_RANGE_EX, RE_NOTRANGE_EX, RE_PLUS, RE_NG_PLUS, RE_QUESTION, RE_NG_QUESTION, RE_JMP, RE_PUSH_CHARPOS, RE_POP_CHARPOS, RE_CALL, RE_RETURN, RE_STAR_BEGIN, RE_NG_STAR_BEGIN, RE_PUSH_MEMORY, RE_POP_MEMORY, RE_STORE_CHARPOS, RE_STORE_STACKPOS, RE_GET_CHARPOS, RE_GET_STACKPOS, RE_RET_NOMATCH, RE_PREVIOUS, RE_FAIL, RE_ADVANCE, RE_MATCH, RE_PUSH_GROUP, RE_POP_GROUP, }; struct INSTRUCTION_SYMBOL { size_t nSymbol; }; struct INSTRUCTION_JMP { size_t nTarget; }; struct INSTRUCTION_GROUP { size_t nGroup; }; struct INSTRUCTION_CALL { size_t nTarget; }; struct INSTRUCTION_MEMORY { size_t nIndex; }; struct INSTRUCTION_PREVIOUS { size_t nGroup; }; struct INSTRUCTION_RANGE_EX { size_t nTarget; }; struct INSTRUCTION { REInstructionType type; union { INSTRUCTION_SYMBOL symbol; INSTRUCTION_JMP jmp; INSTRUCTION_GROUP group; INSTRUCTION_CALL call; INSTRUCTION_MEMORY memory; INSTRUCTION_PREVIOUS prev; INSTRUCTION_RANGE_EX range; }; }; inline int InstructionsPerRangeBitField() throw() { return (256/8) / sizeof(INSTRUCTION) + (((256/8) % sizeof(INSTRUCTION)) ? 1 : 0); } CAtlArray m_Instructions; UINT m_uNumGroups; UINT m_uRequiredMem; BOOL m_bCaseSensitive; // class used internally to restore // parsing state when unwinding class CParseState { public: int m_nNumInstructions; UINT m_uNumGroups; UINT m_uRequiredMem; CParseState(CAtlRegExp *pRegExp) throw() { m_nNumInstructions = (int) pRegExp->m_Instructions.GetCount(); m_uNumGroups = pRegExp->m_uNumGroups; m_uRequiredMem = pRegExp->m_uRequiredMem; } void Restore(CAtlRegExp *pRegExp) { pRegExp->m_Instructions.SetCount(m_nNumInstructions); pRegExp->m_uNumGroups = m_uNumGroups; pRegExp->m_uRequiredMem = m_uRequiredMem; } }; int AddInstruction(REInstructionType type) { if (!m_Instructions.SetCount(m_Instructions.GetCount()+1)) { SetLastParseError(REPARSE_ERROR_OUTOFMEMORY); return -1; } m_Instructions[m_Instructions.GetCount()-1].type = type; return (int) m_Instructions.GetCount()-1; } BOOL PeekToken(const RECHAR **ppszRE, int ch) throw() { if (**ppszRE != ch) return FALSE; return TRUE; } BOOL MatchToken(const RECHAR **ppszRE, int ch) throw() { if (!PeekToken(ppszRE, ch)) return FALSE; *ppszRE = CharTraits::Next(*ppszRE); return TRUE; } INSTRUCTION &GetInstruction(size_t nIndex) throw() { return m_Instructions[nIndex]; } // ParseArg: parse grammar rule Arg int ParseArg(const RECHAR **ppszRE, bool &bEmpty) { int nPushGroup = AddInstruction(RE_PUSH_GROUP); if (nPushGroup < 0) return -1; GetInstruction(nPushGroup).group.nGroup = m_uNumGroups; int p = AddInstruction(RE_GROUP_START); if (p < 0) return -1; GetInstruction(p).group.nGroup = m_uNumGroups++; int nCall = AddInstruction(RE_CALL); if (nCall < 0) return -1; int nPopGroup = AddInstruction(RE_POP_GROUP); if (nPopGroup < 0) return -1; GetInstruction(nPopGroup).group.nGroup = GetInstruction(nPushGroup).group.nGroup; if (AddInstruction(RE_RETURN) < 0) return -1; int nAlt = ParseRE(ppszRE, bEmpty); if (nAlt < 0) { if (GetLastParseError()) return -1; if (!PeekToken(ppszRE, '}')) { SetLastParseError(REPARSE_ERROR_BRACE_EXPECTED); return -1; } // in the case of an empty group, we add a nop nAlt = AddInstruction(RE_NOP); if (nAlt < 0) return -1; } GetInstruction(nCall).call.nTarget = nAlt; if (!MatchToken(ppszRE, '}')) { SetLastParseError(REPARSE_ERROR_BRACE_EXPECTED); return -1; } int nEnd = AddInstruction(RE_GROUP_END); if (nEnd < 0) return -1; GetInstruction(nEnd).group.nGroup = GetInstruction(p).group.nGroup; return nPushGroup; } // ParseGroup: parse grammar rule Group int ParseGroup(const RECHAR **ppszRE, bool &bEmpty) { int nCall = AddInstruction(RE_CALL); if (nCall < 0) return -1; if (AddInstruction(RE_RETURN) < 0) return -1; int nAlt = ParseRE(ppszRE, bEmpty); if (nAlt < 0) { if (GetLastParseError()) return -1; if (!PeekToken(ppszRE, ')')) { SetLastParseError(REPARSE_ERROR_PAREN_EXPECTED); return -1; } // in the case of an empty group, we add a nop nAlt = AddInstruction(RE_NOP); if (nAlt < 0) return -1; } GetInstruction(nCall).call.nTarget = nAlt; if (!MatchToken(ppszRE, ')')) { SetLastParseError(REPARSE_ERROR_PAREN_EXPECTED); return -1; } return nCall; } RECHAR GetEscapedChar(RECHAR ch) throw() { if (ch == 't') return '\t'; return ch; } // ParseCharItem: parse grammar rule CharItem int ParseCharItem(const RECHAR **ppszRE, RECHAR *pchStartChar, RECHAR *pchEndChar) throw() { if (**ppszRE == '\\') { *ppszRE = CharTraits::Next(*ppszRE); *pchStartChar = GetEscapedChar(**ppszRE); } else *pchStartChar = **ppszRE; *ppszRE = CharTraits::Next(*ppszRE); if (!MatchToken(ppszRE, '-')) { *pchEndChar = *pchStartChar; return 0; } // check for unterminated range if (!**ppszRE || PeekToken(ppszRE, ']')) { SetLastParseError(REPARSE_ERROR_BRACKET_EXPECTED); return -1; } *pchEndChar = **ppszRE; *ppszRE = CharTraits::Next(*ppszRE); if (*pchEndChar < *pchStartChar) { SetLastParseError(REPARSE_ERROR_INVALID_RANGE); return -1; } return 0; } int AddInstructions(int nNumInstructions) { size_t nCurr = m_Instructions.GetCount(); if (!m_Instructions.SetCount(nCurr+nNumInstructions)) { SetLastParseError(REPARSE_ERROR_OUTOFMEMORY); return -1; } return (int) nCurr; } // ParseCharSet: parse grammar rule CharSet int ParseCharSet(const RECHAR **ppszRE, BOOL bNot) { int p = -1; unsigned char *pBits = NULL; if (CharTraits::UseBitFieldForRange()) { // we use a bit field to represent the characters // a 1 bit means match against the character // the last 5 bits are used as an index into // the byte array, and the first 3 bits // are used to index into the selected byte p = AddInstruction(bNot ? RE_NOTRANGE : RE_RANGE); if (p < 0) return -1; // add the required space to hold the character // set. We use one bit per character for ansi if (AddInstructions(InstructionsPerRangeBitField()) < 0) return -1; pBits = (unsigned char *) (&m_Instructions[p+1]); memset(pBits, 0x00, 256/8); } else { p = AddInstruction(bNot ? RE_NOTRANGE_EX : RE_RANGE_EX); if (p < 0) return -1; } RECHAR chStart; RECHAR chEnd; while (**ppszRE && **ppszRE != ']') { if (ParseCharItem(ppszRE, &chStart, &chEnd)) return -1; if (CharTraits::UseBitFieldForRange()) { for (int i=chStart; i<=chEnd; i++) pBits[i >> 3] |= 1 << (i & 0x7); } else { int nStart = AddInstruction(RE_NOP); if (nStart < 0) return -1; int nEnd = AddInstruction(RE_NOP); if (nEnd < 0) return -1; GetInstruction(nStart).memory.nIndex = (int) chStart; GetInstruction(nEnd).memory.nIndex = (int) chEnd; } } if (!CharTraits::UseBitFieldForRange()) GetInstruction(p).range.nTarget = m_Instructions.GetCount(); return p; } // ParseCharClass: parse grammar rule CharClass int ParseCharClass(const RECHAR **ppszRE, bool &bEmpty) { bEmpty = false; if (MatchToken(ppszRE, ']')) { SetLastParseError(REPARSE_ERROR_EMPTY_RANGE); return -1; } BOOL bNot = FALSE; if (MatchToken(ppszRE, '^')) bNot = TRUE; if (MatchToken(ppszRE, ']')) { SetLastParseError(REPARSE_ERROR_EMPTY_RANGE); return -1; } int p = ParseCharSet(ppszRE, bNot); if (p < 0) return p; if (!MatchToken(ppszRE, ']')) { SetLastParseError(REPARSE_ERROR_BRACKET_EXPECTED); return -1; } return p; } int AddMemInstruction(REInstructionType type) { int p = AddInstruction(type); if (p < 0) return p; GetInstruction(p).memory.nIndex = m_uRequiredMem++; return p; } // helper for parsing !SE int ParseNot(const RECHAR **ppszRE, bool &bEmpty) { int nStoreCP = AddMemInstruction(RE_STORE_CHARPOS); int nStoreSP = AddMemInstruction(RE_STORE_STACKPOS); int nCall = AddInstruction(RE_CALL); if (nCall < 0) return -1; int nGetCP = AddInstruction(RE_GET_CHARPOS); if (nGetCP < 0) return -1; GetInstruction(nGetCP).memory.nIndex = GetInstruction(nStoreCP).memory.nIndex; int nGetSP = AddInstruction(RE_GET_STACKPOS); if (nGetSP < 0) return -1; GetInstruction(nGetSP).memory.nIndex = GetInstruction(nStoreSP).memory.nIndex; int nJmp = AddInstruction(RE_JMP); if (nJmp < 0) return -1; int nSE = ParseSE(ppszRE, bEmpty); if (nSE < 0) return nSE; // patch the call GetInstruction(nCall).call.nTarget = nSE; int nGetCP1 = AddInstruction(RE_GET_CHARPOS); if (nGetCP1 < 0) return -1; GetInstruction(nGetCP1).memory.nIndex = GetInstruction(nStoreCP).memory.nIndex; int nGetSP1 = AddInstruction(RE_GET_STACKPOS); if (nGetSP1 < 0) return -1; GetInstruction(nGetSP1).memory.nIndex = GetInstruction(nStoreSP).memory.nIndex; int nRet = AddInstruction(RE_RETURN); if (nRet < 0) return -1; GetInstruction(nJmp).jmp.nTarget = nRet+1; return nStoreCP; } // ParseAbbrev: parse grammar rule Abbrev int ParseAbbrev(const RECHAR **ppszRE, bool &bEmpty) { const RECHAR **szAbbrevs = CharTraits::GetAbbrevs(); while (*szAbbrevs) { if (**ppszRE == **szAbbrevs) { const RECHAR *szAbbrev = (*szAbbrevs)+1; int p = ParseE(&szAbbrev, bEmpty); if (p < 0) { SetLastParseError(REPARSE_ERROR_UNEXPECTED); return p; } *ppszRE = CharTraits::Next(*ppszRE); return p; } szAbbrevs++; } return -1; } // ParseSE: parse grammar rule SE (simple expression) int ParseSE(const RECHAR **ppszRE, bool &bEmpty) { if (MatchToken(ppszRE, '{')) return ParseArg(ppszRE, bEmpty); if (MatchToken(ppszRE, '(')) return ParseGroup(ppszRE, bEmpty); if (MatchToken(ppszRE, '[')) return ParseCharClass(ppszRE, bEmpty); if (MatchToken(ppszRE, '\\')) { if (!CharTraits::Isdigit(**ppszRE)) { // check for abbreviations int p; p = ParseAbbrev(ppszRE, bEmpty); if (p >= 0) return p; if (GetLastParseError()) return -1; // escaped char p = AddInstruction(RE_SYMBOL); if (p < 0) return -1; GetInstruction(p).symbol.nSymbol = (int) **ppszRE; *ppszRE = CharTraits::Next(*ppszRE); return p; } // previous match bEmpty = false; int nPrev = AddInstruction(RE_PREVIOUS); if (nPrev < 0) return -1; UINT uValue = (UINT) CharTraits::Strtol(*ppszRE, (RECHAR **) ppszRE, 10); if (uValue >= m_uNumGroups) { SetLastParseError(REPARSE_ERROR_INVALID_GROUP); return -1; } GetInstruction(nPrev).prev.nGroup = (size_t) uValue; return nPrev; } if (MatchToken(ppszRE, '!')) return ParseNot(ppszRE, bEmpty); if (**ppszRE == '}' || **ppszRE == ']' || **ppszRE == ')') { return -1; } if (**ppszRE == '\0') { return -1; } int p; if (**ppszRE == '.') { p = AddInstruction(RE_ANY); if (p < 0) return -1; bEmpty = false; } else if (**ppszRE == '$' && (*ppszRE)[1] == '\0') { p = AddInstruction(RE_SYMBOL); if (p < 0) return -1; GetInstruction(p).symbol.nSymbol = 0; bEmpty = false; } else { p = AddInstruction(RE_SYMBOL); if (p < 0) return -1; GetInstruction(p).symbol.nSymbol = (int) **ppszRE; bEmpty = false; } *ppszRE = CharTraits::Next(*ppszRE); return p; } // ParseE: parse grammar rule E (expression) int ParseE(const RECHAR **ppszRE, bool &bEmpty) { CParseState ParseState(this); const RECHAR *sz = *ppszRE; int nSE; int nFirst = ParseSE(ppszRE, bEmpty); if (nFirst < 0) return nFirst; REInstructionType type = RE_MATCH; if (MatchToken(ppszRE, '*')) if(MatchToken(ppszRE, '?')) type = RE_NG_STAR_BEGIN; else type = RE_STAR_BEGIN; else if (MatchToken(ppszRE, '+')) if(MatchToken(ppszRE, '?')) type = RE_NG_PLUS; else type = RE_PLUS; else if (MatchToken(ppszRE, '?')) if(MatchToken(ppszRE, '?')) type = RE_NG_QUESTION; else type = RE_QUESTION; if (type == RE_MATCH) return nFirst; if (type == RE_STAR_BEGIN || type == RE_QUESTION|| type == RE_NG_STAR_BEGIN || type == RE_NG_QUESTION) { ParseState.Restore(this); } else { m_uNumGroups = ParseState.m_uNumGroups; } *ppszRE = sz; int nE; if (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS || type == RE_NG_QUESTION) // Non-Greedy { int nCall = AddInstruction(RE_CALL); if (nCall < 0) return -1; bEmpty = false; nSE = ParseSE(ppszRE, bEmpty); if (nSE < 0) return nSE; if (bEmpty && (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS)) { SetLastParseError(REPARSE_ERROR_EMPTY_REPEATOP); return -1; } bEmpty = true; *ppszRE = CharTraits::Next(*ppszRE); *ppszRE = CharTraits::Next(*ppszRE); if (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS) { int nJmp = AddInstruction(RE_JMP); if (nJmp < 0) return -1; GetInstruction(nCall).call.nTarget = nJmp+1; GetInstruction(nJmp).jmp.nTarget = nCall; } else GetInstruction(nCall).call.nTarget = nSE+1; if (type == RE_NG_PLUS) nE = nFirst; else nE = nCall; } else // Greedy { int nPushMem = AddInstruction(RE_PUSH_MEMORY); if (nPushMem < 0) return -1; int nStore = AddInstruction(RE_STORE_CHARPOS); if (nStore < 0) return -1; if (AddInstruction(RE_PUSH_CHARPOS) < 0) return -1; int nCall = AddInstruction(RE_CALL); if (nCall < 0) return -1; if (AddInstruction(RE_POP_CHARPOS) < 0) return -1; int nPopMem = AddInstruction(RE_POP_MEMORY); if (nPopMem < 0) return -1; int nJmp = AddInstruction(RE_JMP); if (nJmp < 0) return -1; GetInstruction(nPushMem).memory.nIndex = m_uRequiredMem++; GetInstruction(nStore).memory.nIndex = GetInstruction(nPushMem).memory.nIndex; GetInstruction(nCall).call.nTarget = nJmp+1; GetInstruction(nPopMem).memory.nIndex = GetInstruction(nPushMem).memory.nIndex; bEmpty = false; nSE = ParseSE(ppszRE, bEmpty); if (nSE < 0) return nSE; if (bEmpty && (type == RE_STAR_BEGIN || type == RE_PLUS)) { SetLastParseError(REPARSE_ERROR_EMPTY_REPEATOP); return -1; } if (type != RE_PLUS && type != RE_NG_PLUS) bEmpty = true; *ppszRE = CharTraits::Next(*ppszRE); int nRetNoMatch = AddInstruction(RE_RET_NOMATCH); if (nRetNoMatch < 0) return -1; int nStore1 = AddInstruction(RE_STORE_CHARPOS); if (nStore1 < 0) return -1; GetInstruction(nRetNoMatch).memory.nIndex = GetInstruction(nPushMem).memory.nIndex; GetInstruction(nStore1).memory.nIndex = GetInstruction(nPushMem).memory.nIndex; if (type != RE_QUESTION) { int nJmp1 = AddInstruction(RE_JMP); if (nJmp1 < 0) return -1; GetInstruction(nJmp1).jmp.nTarget = nPushMem; } GetInstruction(nJmp).jmp.nTarget = m_Instructions.GetCount(); if (type == RE_PLUS) nE = nFirst; else nE = nPushMem; } return nE; } // ParseAltE: parse grammar rule AltE int ParseAltE(const RECHAR **ppszRE, bool &bEmpty) { const RECHAR *sz = *ppszRE; CParseState ParseState(this); int nPush = AddInstruction(RE_PUSH_CHARPOS); if (nPush < 0) return -1; int nCall = AddInstruction(RE_CALL); if (nCall < 0) return -1; GetInstruction(nCall).call.nTarget = nPush+4; if (AddInstruction(RE_POP_CHARPOS) < 0) return -1; int nJmpNext = AddInstruction(RE_JMP); if (nJmpNext < 0) return -1; int nE = ParseE(ppszRE, bEmpty); if (nE < 0) { if (GetLastParseError()) return -1; ParseState.Restore(this); return nE; } int nJmpEnd = AddInstruction(RE_JMP); if (nJmpEnd < 0) return -1; GetInstruction(nJmpNext).jmp.nTarget = nJmpEnd+1; if (!MatchToken(ppszRE, '|')) { ParseState.Restore(this); *ppszRE = sz; return ParseE(ppszRE, bEmpty); } bool bEmptyAltE; int nAltE = ParseAltE(ppszRE, bEmptyAltE); GetInstruction(nJmpEnd).jmp.nTarget = m_Instructions.GetCount(); GetInstruction(nJmpNext).jmp.nTarget = nAltE; if (nAltE < 0) { if (GetLastParseError()) return -1; ParseState.Restore(this); return nAltE; } bEmpty = bEmpty | bEmptyAltE; return nPush; } // ParseRE: parse grammar rule RE (regular expression) int ParseRE(const RECHAR **ppszRE, bool &bEmpty) { if (**ppszRE == '\0') return -1; int p = ParseAltE(ppszRE, bEmpty); if (p < 0) return p; bool bEmptyRE = true; ParseRE(ppszRE, bEmptyRE); if (GetLastParseError()) return -1; bEmpty = bEmpty && bEmptyRE; return p; } //pointers to the matched string and matched groups, currently point into an internal allocated //buffer that hold a copy of the input string. //This function fix these pointers to point into the original, user supplied buffer (first param to Match method). //Example: If a ptr (szStart) currently point to +3, it is fixed to +3 void FixupMatchContext(CAtlREMatchContext *pContext, const RECHAR *szOrig, const RECHAR *szNew) { ATLENSURE(pContext); ATLASSERT(szOrig); ATLASSERT(szNew); pContext->m_Match.szStart = szOrig + (pContext->m_Match.szStart - szNew); pContext->m_Match.szEnd = szOrig + (pContext->m_Match.szEnd - szNew); for (UINT i=0; im_uNumGroups; i++) { if (pContext->m_Matches[i].szStart==NULL || pContext->m_Matches[i].szEnd==NULL) { continue; //Do not fix unmatched groups. } pContext->m_Matches[i].szStart = szOrig + (pContext->m_Matches[i].szStart - szNew); pContext->m_Matches[i].szEnd = szOrig + (pContext->m_Matches[i].szEnd - szNew); } } // implementation // helpers for dumping and debugging the rx engine public: #ifdef ATL_REGEXP_DUMP size_t DumpInstruction(size_t ip) { printf("%08x ", ip); switch (GetInstruction(ip).type) { case RE_NOP: printf("NOP\n"); ip++; break; case RE_SYMBOL: AtlprintfT(CAToREChar("Symbol %c\n"),GetInstruction(ip).symbol.nSymbol); ip++; break; case RE_ANY: printf("Any\n"); ip++; break; case RE_RANGE: printf("Range\n"); ip++; ip += InstructionsPerRangeBitField(); break; case RE_NOTRANGE: printf("NOT Range\n"); ip++; ip += InstructionsPerRangeBitField(); break; case RE_RANGE_EX: printf("RangeEx %08x\n", GetInstruction(ip).range.nTarget); ip++; break; case RE_NOTRANGE_EX: printf("NotRangeEx %08x\n", GetInstruction(ip).range.nTarget); ip++; break; case RE_GROUP_START: printf("Start group %d\n", GetInstruction(ip).group.nGroup); ip++; break; case RE_GROUP_END: printf("Group end %d\n", GetInstruction(ip).group.nGroup); ip++; break; case RE_PUSH_CHARPOS: printf("Push char pos\n"); ip++; break; case RE_POP_CHARPOS: printf("Pop char pos\n"); ip++; break; case RE_STORE_CHARPOS: printf("Store char pos %d\n", GetInstruction(ip).memory.nIndex); ip++; break; case RE_GET_CHARPOS: printf("Get char pos %d\n", GetInstruction(ip).memory.nIndex); ip++; break; case RE_STORE_STACKPOS: printf("Store stack pos %d\n", GetInstruction(ip).memory.nIndex); ip++; break; case RE_GET_STACKPOS: printf("Get stack pos %d\n", GetInstruction(ip).memory.nIndex); ip++; break; case RE_CALL: printf("Call %08x\n", GetInstruction(ip).call.nTarget); ip++; break; case RE_JMP: printf("Jump %08x\n", GetInstruction(ip).jmp.nTarget); ip++; break; case RE_RETURN: printf("return\n"); ip++; break; case RE_PUSH_MEMORY: printf("Push memory %08x\n", GetInstruction(ip).memory.nIndex); ip++; break; case RE_POP_MEMORY: printf("Pop memory %08x\n", GetInstruction(ip).memory.nIndex); ip++; break; case RE_RET_NOMATCH: printf("Return no match %08x\n", GetInstruction(ip).memory.nIndex); ip++; break; case RE_MATCH: printf("END\n"); ip++; break; case RE_ADVANCE: printf("ADVANCE\n"); ip++; break; case RE_FAIL: printf("FAIL\n"); ip++; break; case RE_PREVIOUS: printf("Prev %d\n", GetInstruction(ip).prev.nGroup); ip++; break; case RE_PUSH_GROUP: printf("Push group %d\n", GetInstruction(ip).group.nGroup); ip++; break; case RE_POP_GROUP: printf("Pop group %d\n", GetInstruction(ip).group.nGroup); ip++; break; default: printf("????\n"); ip++; break; } return ip; } void Dump(size_t ipCurrent = 0) { size_t ip = 0; while (ip < m_Instructions.GetCount()) { if (ip == ipCurrent) printf("->"); ip = DumpInstruction(ip); } } #endif #ifdef ATLRX_DEBUG void cls( HANDLE hConsole ) { COORD coordScreen = { 0, 0 }; /* here's where we'll home the cursor */ BOOL bSuccess; DWORD cCharsWritten; CONSOLE_SCREEN_BUFFER_INFO csbi; /* to get buffer info */ DWORD dwConSize; /* number of character cells in the current buffer */ /* get the number of character cells in the current buffer */ bSuccess = GetConsoleScreenBufferInfo( hConsole, &csbi ); dwConSize = csbi.dwSize.X * csbi.dwSize.Y; /* fill the entire screen with blanks */ bSuccess = FillConsoleOutputCharacter( hConsole, (TCHAR) ' ', dwConSize, coordScreen, &cCharsWritten ); /* get the current text attribute */ bSuccess = GetConsoleScreenBufferInfo( hConsole, &csbi ); /* now set the buffer's attributes accordingly */ bSuccess = FillConsoleOutputAttribute( hConsole, csbi.wAttributes, dwConSize, coordScreen, &cCharsWritten ); /* put the cursor at (0, 0) */ bSuccess = SetConsoleCursorPosition( hConsole, coordScreen ); return; } void DumpStack(CAtlREMatchContext *pContext) { for (size_t i=pContext->m_nTos; i>0; i--) { if (pContext->m_stack[i] < (void *) m_Instructions.GetCount()) printf("0x%p\n", pContext->m_stack[i]); else { // assume a pointer into the input AtlprintfT(CAToREChar("%s\n"), pContext->m_stack[i]); } } } void DumpMemory(CAtlREMatchContext *pContext) { for (UINT i=0; i(CAToREChar("%d: %s\n"), i, pContext->m_Mem.m_p[i]); } } virtual void OnDebugEvent(size_t ip, const RECHAR *szIn, const RECHAR *sz, CAtlREMatchContext *pContext) { cls(GetStdHandle(STD_OUTPUT_HANDLE)); printf("----------Code---------\n"); Dump(ip); printf("----------Input---------\n"); AtlprintfT(CAToREChar("%s\n"), szIn); for (int s=0; szIn+s < sz; s++) { printf(" "); } printf("^\n"); printf("----------Memory---------\n"); DumpMemory(pContext); printf("----------Stack---------\n"); DumpStack(pContext); getchar(); } #endif }; } // namespace ATL #pragma pack(pop) #endif // __ATLRX_H__