atlrx.h 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013
  1. // This is a part of the Active Template Library.
  2. // Copyright (C) Microsoft Corporation
  3. // All rights reserved.
  4. //
  5. // This source code is only intended as a supplement to the
  6. // Active Template Library Reference and related
  7. // electronic documentation provided with the library.
  8. // See these sources for detailed information regarding the
  9. // Active Template Library product.
  10. #ifndef __ATLRX_H__
  11. #define __ATLRX_H__
  12. #pragma once
  13. #include <atlbase.h>
  14. #include <atlcoll.h>
  15. #include <mbstring.h>
  16. #ifndef ATL_REGEXP_MIN_STACK
  17. #define ATL_REGEXP_MIN_STACK 256
  18. #endif
  19. /*
  20. Regular Expression Grammar
  21. R - top level grammar rule
  22. RE - regular expression
  23. AltE - Alternative expression
  24. E - expression
  25. SE - simple expression
  26. R -> RE
  27. '^'RE (matches beginning of string)
  28. RE -> AltE RE
  29. AltE
  30. AltE -> E
  31. E '|' AltE
  32. E -> SE (RepeatOp '?'?)?
  33. SE -> Arg
  34. Group
  35. CharClass
  36. '\'Abbrev (see below)
  37. '\'EscapedChar (any character including reserved symbols)
  38. '\'Digit+ (Arg back reference)
  39. '!' (not)
  40. '.' (any char)
  41. '$' (end of input)
  42. Symbol (any non-reserved character)
  43. Arg -> '{'RE'}'
  44. Group -> '('RE')'
  45. CharClass -> '[' '^'? CharSet ']'
  46. CharSet -> CharItem+
  47. CharItem -> Char('-'Char)?
  48. RepeatOp -> '*'
  49. '+'
  50. '?'
  51. Abbrev -> Abbreviation defined in CAtlRECharTraits
  52. Abbrev Expansion Meaning
  53. a ([a-zA-Z0-9]) alpha numeric
  54. b ([ \\t]) white space (blank)
  55. c ([a-zA-Z]) alpha
  56. d ([0-9]) digit
  57. h ([0-9a-fA-F]) hex digit
  58. n (\r|(\r?\n)) newline
  59. q (\"[^\"]*\")|(\'[^\']*\') quoted string
  60. w ([a-zA-Z]+) simple word
  61. z ([0-9]+) integer
  62. */
  63. #pragma pack(push,_ATL_PACKING)
  64. namespace ATL {
  65. //Conversion utility classes used to convert char* to RECHAR.
  66. //Used by rx debugging printing.
  // Adapter that presents a narrow (char) string as a RECHARTYPE string.
  // The primary template performs no conversion: it stores the pointer it
  // is given and returns that same pointer, so the conversion operator only
  // compiles when const char* converts implicitly to const RECHARTYPE*.
  // The string itself is not copied.
  template <typename RECHARTYPE = char>
  class CAToREChar
  {
  public:
      // Not explicit, so a const char* converts to this type implicitly.
      CAToREChar(const char* psz) throw() : m_psz(psz) {}

      // Yields the pointer passed to the constructor, unchanged.
      operator const RECHARTYPE*() const throw()
      {
          return m_psz;
      }

      const char* m_psz; // pointer supplied to the constructor
  };
  78. template<>
  79. class CAToREChar<wchar_t>
  80. {
  81. public:
  82. CAToREChar(const char* psz) throw()
  83. : m_a2w(psz)
  84. {
  85. }
  86. operator const wchar_t*() const throw() { return (wchar_t*)m_a2w; }
  87. private:
  88. CA2W m_a2w;
  89. };
  90. class CAtlRECharTraitsA
  91. {
  92. public:
  93. typedef char RECHARTYPE;
  94. static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw()
  95. {
  96. #ifndef ATL_NO_CHECK_BIT_FIELD
  97. ATLASSERT(UseBitFieldForRange());
  98. #endif
  99. return static_cast<size_t>(static_cast<unsigned char>(*sz));
  100. }
  101. static RECHARTYPE *Next(const RECHARTYPE *sz) throw()
  102. {
  103. return (RECHARTYPE *) (sz+1);
  104. }
  105. static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
  106. {
  107. return strncmp(szLeft, szRight, nCount);
  108. }
  109. static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
  110. {
  111. return _strnicmp(szLeft, szRight, nCount);
  112. }
  113. _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsA::Strlwr must be passed a buffer size.")
  114. static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw()
  115. {
  116. #pragma warning (push)
  117. #pragma warning(disable : 4996)
  118. return _strlwr(sz);
  119. #pragma warning (pop)
  120. }
  121. static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw()
  122. {
  123. Checked::strlwr_s(sz, nSize);
  124. return sz;
  125. }
  126. static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw()
  127. {
  128. return strtol(sz, szEnd, nBase);
  129. }
  130. static int Isdigit(RECHARTYPE ch) throw()
  131. {
  132. return isdigit(static_cast<unsigned char>(ch));
  133. }
  134. static const RECHARTYPE** GetAbbrevs()
  135. {
  136. static const RECHARTYPE *s_szAbbrevs[] =
  137. {
  138. "a([a-zA-Z0-9])", // alpha numeric
  139. "b([ \\t])", // white space (blank)
  140. "c([a-zA-Z])", // alpha
  141. "d([0-9])", // digit
  142. "h([0-9a-fA-F])", // hex digit
  143. "n(\r|(\r?\n))", // newline
  144. "q(\"[^\"]*\")|(\'[^\']*\')", // quoted string
  145. "w([a-zA-Z]+)", // simple word
  146. "z([0-9]+)", // integer
  147. NULL
  148. };
  149. return s_szAbbrevs;
  150. }
  151. static BOOL UseBitFieldForRange() throw()
  152. {
  153. return TRUE;
  154. }
  155. static int ByteLen(const RECHARTYPE *sz) throw()
  156. {
  157. return int(strlen(sz));
  158. }
  159. };
  160. class CAtlRECharTraitsW
  161. {
  162. public:
  163. typedef WCHAR RECHARTYPE;
  164. static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw()
  165. {
  166. #ifndef ATL_NO_CHECK_BIT_FIELD
  167. ATLASSERT(UseBitFieldForRange());
  168. #endif
  169. return static_cast<size_t>(*sz);
  170. }
  171. static RECHARTYPE *Next(const RECHARTYPE *sz) throw()
  172. {
  173. return (RECHARTYPE *) (sz+1);
  174. }
  175. static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
  176. {
  177. return wcsncmp(szLeft, szRight, nCount);
  178. }
  179. static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
  180. {
  181. return _wcsnicmp(szLeft, szRight, nCount);
  182. }
  183. _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsW::Strlwr must be passed a buffer size.")
  184. static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw()
  185. {
  186. #pragma warning (push)
  187. #pragma warning(disable : 4996)
  188. return _wcslwr(sz);
  189. #pragma warning (pop)
  190. }
  191. static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw()
  192. {
  193. Checked::wcslwr_s(sz, nSize);
  194. return sz;
  195. }
  196. static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw()
  197. {
  198. return wcstol(sz, szEnd, nBase);
  199. }
  200. static int Isdigit(RECHARTYPE ch) throw()
  201. {
  202. return iswdigit(ch);
  203. }
  204. static const RECHARTYPE** GetAbbrevs()
  205. {
  206. static const RECHARTYPE *s_szAbbrevs[] =
  207. {
  208. L"a([a-zA-Z0-9])", // alpha numeric
  209. L"b([ \\t])", // white space (blank)
  210. L"c([a-zA-Z])", // alpha
  211. L"d([0-9])", // digit
  212. L"h([0-9a-fA-F])", // hex digit
  213. L"n(\r|(\r?\n))", // newline
  214. L"q(\"[^\"]*\")|(\'[^\']*\')", // quoted string
  215. L"w([a-zA-Z]+)", // simple word
  216. L"z([0-9]+)", // integer
  217. NULL
  218. };
  219. return s_szAbbrevs;
  220. }
  221. static BOOL UseBitFieldForRange() throw()
  222. {
  223. return FALSE;
  224. }
  225. static int ByteLen(const RECHARTYPE *sz) throw()
  226. {
  227. return int(wcslen(sz)*sizeof(WCHAR));
  228. }
  229. };
  230. class CAtlRECharTraitsMB
  231. {
  232. public:
  233. typedef unsigned char RECHARTYPE;
  234. static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw()
  235. {
  236. #ifndef ATL_NO_CHECK_BIT_FIELD
  237. ATLASSERT(UseBitFieldForRange());
  238. #endif
  239. return static_cast<size_t>(*sz);
  240. }
  241. static RECHARTYPE *Next(const RECHARTYPE *sz) throw()
  242. {
  243. return _mbsinc(sz);
  244. }
  245. static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
  246. {
  247. return _mbsncmp(szLeft, szRight, nCount);
  248. }
  249. static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
  250. {
  251. return _mbsnicmp(szLeft, szRight, nCount);
  252. }
  253. _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsMB::Strlwr must be passed a buffer size.")
  254. static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw()
  255. {
  256. #pragma warning (push)
  257. #pragma warning(disable : 4996)
  258. return _mbslwr(sz);
  259. #pragma warning (pop)
  260. }
  261. static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw()
  262. {
  263. Checked::mbslwr_s(sz, nSize);
  264. return sz;
  265. }
  266. static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw()
  267. {
  268. return strtol((const char *) sz, (char **) szEnd, nBase);
  269. }
  270. static int Isdigit(RECHARTYPE ch) throw()
  271. {
  272. return _ismbcdigit((unsigned int) ch);
  273. }
  274. static const RECHARTYPE** GetAbbrevs()
  275. {
  276. return reinterpret_cast<const RECHARTYPE **>(CAtlRECharTraitsA::GetAbbrevs());
  277. }
  278. static BOOL UseBitFieldForRange() throw()
  279. {
  280. return FALSE;
  281. }
  282. static int ByteLen(const RECHARTYPE *sz) throw()
  283. {
  284. return (int)strlen((const char *) sz);
  285. }
  286. };
  287. #ifndef _UNICODE
  288. typedef CAtlRECharTraitsA CAtlRECharTraits;
  289. #else // _UNICODE
  290. typedef CAtlRECharTraitsW CAtlRECharTraits;
  291. #endif // !_UNICODE
  292. // Note: If you want to use CAtlRECharTraitsMB you must pass it in
  293. // as a template argument
  294. template <class CharTraits=CAtlRECharTraits>
  295. class CAtlRegExp; // forward declaration
  296. template <class CharTraits=CAtlRECharTraits>
  297. class CAtlREMatchContext
  298. {
  299. public:
  300. friend CAtlRegExp<CharTraits>;
  301. typedef typename CharTraits::RECHARTYPE RECHAR;
  302. struct MatchGroup
  303. {
  304. const RECHAR *szStart;
  305. const RECHAR *szEnd;
  306. };
  307. UINT m_uNumGroups;
  308. MatchGroup m_Match;
  309. void GetMatch(UINT nIndex, const RECHAR **szStart, const RECHAR **szEnd)
  310. {
  311. ATLENSURE(szStart != NULL);
  312. ATLENSURE(szEnd != NULL);
  313. ATLENSURE(nIndex >=0 && nIndex < m_uNumGroups);
  314. *szStart = m_Matches[nIndex].szStart;
  315. *szEnd = m_Matches[nIndex].szEnd;
  316. }
  317. void GetMatch(UINT nIndex, MatchGroup *pGroup)
  318. {
  319. ATLENSURE(pGroup != NULL);
  320. ATLENSURE(nIndex >=0&&(static_cast<UINT>(nIndex))< m_uNumGroups);
  321. pGroup->szStart = m_Matches[nIndex].szStart;
  322. pGroup->szEnd = m_Matches[nIndex].szEnd;
  323. }
  324. protected:
  325. CAutoVectorPtr<void *> m_Mem;
  326. CAutoVectorPtr<MatchGroup> m_Matches;
  327. CAtlArray<void *> m_stack;
  328. size_t m_nTos;
  329. public:
  330. CAtlREMatchContext(size_t nInitStackSize=ATL_REGEXP_MIN_STACK)
  331. {
  332. m_uNumGroups = 0;
  333. m_nTos = 0;
  334. m_stack.SetCount(nInitStackSize);
  335. m_Match.szStart = NULL;
  336. m_Match.szEnd = NULL;
  337. }
  338. protected:
  339. BOOL Initialize(UINT uRequiredMem, UINT uNumGroups) throw()
  340. {
  341. m_nTos = 0;
  342. m_uNumGroups = 0;
  343. m_Matches.Free();
  344. if (!m_Matches.Allocate(uNumGroups))
  345. return FALSE;
  346. m_uNumGroups = uNumGroups;
  347. m_Mem.Free();
  348. if (!m_Mem.Allocate(uRequiredMem))
  349. return FALSE;
  350. memset(m_Mem.m_p, 0x00, uRequiredMem*sizeof(void *));
  351. memset(m_Matches, 0x00, m_uNumGroups * sizeof(MatchGroup));
  352. return TRUE;
  353. }
  354. BOOL Push(void *p)
  355. {
  356. m_nTos++;
  357. if (m_stack.GetCount() <= (UINT) m_nTos)
  358. {
  359. if (!m_stack.SetCount((m_nTos+1)*2))
  360. {
  361. m_nTos--;
  362. return FALSE;
  363. }
  364. }
  365. m_stack[m_nTos] = p;
  366. return TRUE;
  367. }
  368. BOOL Push(size_t n)
  369. {
  370. return Push((void *) n);
  371. }
  372. void *Pop() throw()
  373. {
  374. if (m_nTos==0)
  375. {
  376. // stack underflow
  377. // this should never happen at match time.
  378. // (the parsing succeeded when it shouldn't have)
  379. ATLASSERT(FALSE);
  380. return NULL;
  381. }
  382. void *p = m_stack[m_nTos];
  383. m_nTos--;
  384. return p;
  385. }
  386. };
  // Status codes reported by CAtlRegExp::Parse. The numeric values are
  // spelled out; they are the same values the implicit numbering gives.
  enum REParseError {
      REPARSE_ERROR_OK               = 0,  // No error occurred
      REPARSE_ERROR_OUTOFMEMORY      = 1,  // Out of memory
      REPARSE_ERROR_BRACE_EXPECTED   = 2,  // A closing brace was expected
      REPARSE_ERROR_PAREN_EXPECTED   = 3,  // A closing parenthesis was expected
      REPARSE_ERROR_BRACKET_EXPECTED = 4,  // A closing bracket was expected
      REPARSE_ERROR_UNEXPECTED       = 5,  // An unspecified fatal error occurred
      REPARSE_ERROR_EMPTY_RANGE      = 6,  // A range expression was empty
      REPARSE_ERROR_INVALID_GROUP    = 7,  // A backreference was made to a group
                                           // that did not exist
      REPARSE_ERROR_INVALID_RANGE    = 8,  // An invalid range was specified
      REPARSE_ERROR_EMPTY_REPEATOP   = 9,  // A possibly empty * or + was detected
      REPARSE_ERROR_INVALID_INPUT    = 10, // The input string was invalid
  };
  401. template <class CharTraits /* =CAtlRECharTraits */>
  402. class CAtlRegExp
  403. {
  404. public:
  405. CAtlRegExp() throw()
  406. {
  407. m_uNumGroups = 0;
  408. m_uRequiredMem = 0;
  409. m_bCaseSensitive = TRUE;
  410. m_LastError = REPARSE_ERROR_OK;
  411. }
  412. typedef typename CharTraits::RECHARTYPE RECHAR;
  413. // CAtlRegExp::Parse
  414. // Parses the regular expression
  415. // returns REPARSE_ERROR_OK if successful, an REParseError otherwise
  416. REParseError Parse(const RECHAR *szRE, BOOL bCaseSensitive=TRUE)
  417. {
  418. ATLASSERT(szRE);
  419. if (!szRE)
  420. return REPARSE_ERROR_INVALID_INPUT;
  421. Reset();
  422. m_bCaseSensitive = bCaseSensitive;
  423. const RECHAR *szInput = szRE;
  424. if (!bCaseSensitive)
  425. {
  426. // copy the string
  427. int nSize = CharTraits::ByteLen(szRE)+sizeof(RECHAR);
  428. szInput = (const RECHAR *) malloc(nSize);
  429. if (!szInput)
  430. return REPARSE_ERROR_OUTOFMEMORY;
  431. Checked::memcpy_s((char *) szInput, nSize, szRE, nSize);
  432. CharTraits::Strlwr(const_cast<RECHAR *>(szInput), nSize/sizeof(RECHAR));
  433. }
  434. const RECHAR *sz = szInput;
  435. int nCall = AddInstruction(RE_CALL);
  436. if (nCall < 0)
  437. return REPARSE_ERROR_OUTOFMEMORY;
  438. if (*sz == '^')
  439. {
  440. if (AddInstruction(RE_FAIL) < 0)
  441. return REPARSE_ERROR_OUTOFMEMORY;
  442. sz++;
  443. }
  444. else
  445. {
  446. if (AddInstruction(RE_ADVANCE) < 0)
  447. return REPARSE_ERROR_OUTOFMEMORY;
  448. }
  449. bool bEmpty = true;
  450. ParseRE(&sz, bEmpty);
  451. if (!GetLastParseError())
  452. {
  453. GetInstruction(nCall).call.nTarget = 2;
  454. if (AddInstruction(RE_MATCH) < 0)
  455. return REPARSE_ERROR_OUTOFMEMORY;
  456. }
  457. if (szInput != szRE)
  458. free((void *) szInput);
  459. return GetLastParseError();
  460. }
  461. BOOL Match(const RECHAR *szIn, CAtlREMatchContext<CharTraits> *pContext, const RECHAR **ppszEnd=NULL)
  462. {
  463. ATLASSERT(szIn);
  464. ATLASSERT(pContext);
  465. if (!szIn || !pContext)
  466. return FALSE;
  467. if (ppszEnd)
  468. *ppszEnd = NULL;
  469. const RECHAR *szInput = szIn;
  470. if (!m_bCaseSensitive)
  471. {
  472. int nSize = CharTraits::ByteLen(szIn)+sizeof(RECHAR);
  473. szInput = (const RECHAR *) malloc(nSize);
  474. if (!szInput)
  475. return FALSE;
  476. Checked::memcpy_s((char *) szInput, nSize, szIn, nSize);
  477. CharTraits::Strlwr(const_cast<RECHAR *>(szInput), nSize/sizeof(RECHAR));
  478. }
  479. if (!pContext->Initialize(m_uRequiredMem, m_uNumGroups))
  480. {
  481. if (szInput != szIn)
  482. free((void *) szInput);
  483. return FALSE;
  484. }
  485. size_t ip = 0;
  486. const RECHAR *sz = szInput;
  487. const RECHAR *szCurrInput = szInput;
  488. #pragma warning(push)
  489. #pragma warning(disable:4127) // conditional expression is constant
  490. while (1)
  491. {
  492. #ifdef ATLRX_DEBUG
  493. OnDebugEvent(ip, szInput, sz, pContext);
  494. #endif
  495. if (ip == 0)
  496. pContext->m_Match.szStart = sz;
  497. switch (GetInstruction(ip).type)
  498. {
  499. case RE_NOP:
  500. ip++;
  501. break;
  502. case RE_SYMBOL:
  503. if (GetInstruction(ip).symbol.nSymbol == static_cast<size_t>(*sz))
  504. {
  505. sz = CharTraits::Next(sz);
  506. ip++;
  507. }
  508. else
  509. {
  510. ip = (size_t) pContext->Pop();
  511. }
  512. break;
  513. case RE_ANY:
  514. if (*sz)
  515. {
  516. sz = CharTraits::Next(sz);
  517. ip++;
  518. }
  519. else
  520. {
  521. ip = (size_t) pContext->Pop();
  522. }
  523. break;
  524. case RE_GROUP_START:
  525. pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart = sz;
  526. ip++;
  527. break;
  528. case RE_GROUP_END:
  529. pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd = sz;
  530. ip++;
  531. break;
  532. case RE_PUSH_CHARPOS:
  533. pContext->Push((void *) sz);
  534. ip++;
  535. break;
  536. case RE_POP_CHARPOS:
  537. sz = (RECHAR *) pContext->Pop();
  538. ip++;
  539. break;
  540. case RE_CALL:
  541. pContext->Push(ip+1);
  542. ip = GetInstruction(ip).call.nTarget;
  543. break;
  544. case RE_JMP:
  545. ip = GetInstruction(ip).jmp.nTarget;
  546. break;
  547. case RE_RETURN:
  548. ip = (size_t) pContext->Pop();
  549. break;
  550. case RE_PUSH_MEMORY:
  551. pContext->Push((void *) (pContext->m_Mem[GetInstruction(ip).memory.nIndex]));
  552. ip++;
  553. break;
  554. case RE_POP_MEMORY:
  555. pContext->m_Mem[GetInstruction(ip).memory.nIndex] = pContext->Pop();
  556. ip++;
  557. break;
  558. case RE_STORE_CHARPOS:
  559. pContext->m_Mem[GetInstruction(ip).memory.nIndex] = (void *) sz;
  560. ip++;
  561. break;
  562. case RE_GET_CHARPOS:
  563. sz = (RECHAR *) pContext->m_Mem[GetInstruction(ip).memory.nIndex];
  564. ip++;
  565. break;
  566. case RE_STORE_STACKPOS:
  567. pContext->m_Mem[GetInstruction(ip).memory.nIndex] = (void *) pContext->m_nTos;
  568. ip++;
  569. break;
  570. case RE_GET_STACKPOS:
  571. pContext->m_nTos = (size_t) pContext->m_Mem[GetInstruction(ip).memory.nIndex];
  572. ip++;
  573. break;
  574. case RE_RET_NOMATCH:
  575. if (sz == (RECHAR *) pContext->m_Mem[GetInstruction(ip).memory.nIndex])
  576. {
  577. // do a return
  578. ip = (size_t) pContext->Pop();
  579. }
  580. else
  581. ip++;
  582. break;
  583. case RE_ADVANCE:
  584. sz = CharTraits::Next(szCurrInput);
  585. szCurrInput = sz;
  586. if (*sz == '\0')
  587. goto Error;
  588. ip = 0;
  589. pContext->m_nTos = 0;
  590. break;
  591. case RE_FAIL:
  592. goto Error;
  593. case RE_RANGE:
  594. {
  595. if (*sz == '\0')
  596. {
  597. ip = (size_t) pContext->Pop();
  598. break;
  599. }
  600. RECHAR *pBits = reinterpret_cast<RECHAR *>((&m_Instructions[ip]+1));
  601. size_t u = CharTraits::GetBitFieldForRangeArrayIndex(sz);
  602. if (pBits[u >> 3] & 1 << (u & 0x7))
  603. {
  604. ip += InstructionsPerRangeBitField();
  605. ip++;
  606. sz = CharTraits::Next(sz);
  607. }
  608. else
  609. {
  610. ip = (size_t) pContext->Pop();
  611. }
  612. }
  613. break;
  614. case RE_NOTRANGE:
  615. {
  616. if (*sz == '\0')
  617. {
  618. ip = (size_t) pContext->Pop();
  619. break;
  620. }
  621. RECHAR *pBits = reinterpret_cast<RECHAR *>((&m_Instructions[ip]+1));
  622. size_t u = static_cast<size_t>(* ((RECHAR *) sz));
  623. if (pBits[u >> 3] & 1 << (u & 0x7))
  624. {
  625. ip = (size_t) pContext->Pop();
  626. }
  627. else
  628. {
  629. ip += InstructionsPerRangeBitField();
  630. ip++;
  631. sz = CharTraits::Next(sz);
  632. }
  633. }
  634. break;
  635. case RE_RANGE_EX:
  636. {
  637. if (*sz == '\0')
  638. {
  639. ip = (size_t) pContext->Pop();
  640. break;
  641. }
  642. BOOL bMatch = FALSE;
  643. size_t inEnd = GetInstruction(ip).range.nTarget;
  644. ip++;
  645. while (ip < inEnd)
  646. {
  647. if (static_cast<size_t>(*sz) >= GetInstruction(ip).memory.nIndex &&
  648. static_cast<size_t>(*sz) <= GetInstruction(ip+1).memory.nIndex)
  649. {
  650. // if we match, we jump to the end
  651. sz = CharTraits::Next(sz);
  652. ip = inEnd;
  653. bMatch = TRUE;
  654. }
  655. else
  656. {
  657. ip += 2;
  658. }
  659. }
  660. if (!bMatch)
  661. {
  662. ip = (size_t) pContext->Pop();
  663. }
  664. }
  665. break;
  666. case RE_NOTRANGE_EX:
  667. {
  668. if (*sz == '\0')
  669. {
  670. ip = (size_t) pContext->Pop();
  671. break;
  672. }
  673. BOOL bMatch = TRUE;
  674. size_t inEnd = GetInstruction(ip).range.nTarget;
  675. ip++;
  676. while (ip < inEnd)
  677. {
  678. if (static_cast<size_t>(*sz) >= GetInstruction(ip).memory.nIndex &&
  679. static_cast<size_t>(*sz) <= GetInstruction(ip+1).memory.nIndex)
  680. {
  681. ip = (size_t) pContext->Pop();
  682. bMatch = FALSE;
  683. break;
  684. }
  685. else
  686. {
  687. // if we match, we jump to the end
  688. ip += 2;
  689. }
  690. }
  691. if (bMatch)
  692. sz = CharTraits::Next(sz);
  693. }
  694. break;
  695. case RE_PREVIOUS:
  696. {
  697. BOOL bMatch = FALSE;
  698. if (m_bCaseSensitive)
  699. {
  700. bMatch = !CharTraits::Strncmp(sz, pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart,
  701. pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart);
  702. }
  703. else
  704. {
  705. bMatch = !CharTraits::Strnicmp(sz, pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart,
  706. pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart);
  707. }
  708. if (bMatch)
  709. {
  710. sz += pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart;
  711. ip++;
  712. break;
  713. }
  714. ip = (size_t) pContext->Pop();
  715. }
  716. break;
  717. case RE_MATCH:
  718. pContext->m_Match.szEnd = sz;
  719. if (!m_bCaseSensitive)
  720. FixupMatchContext(pContext, szIn, szInput);
  721. if (ppszEnd)
  722. *ppszEnd = szIn + (sz - szInput);
  723. if (szInput != szIn)
  724. free((void *) szInput);
  725. return TRUE;
  726. break;
  727. case RE_PUSH_GROUP:
  728. pContext->Push((void *) pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart);
  729. pContext->Push((void *) pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd);
  730. ip++;
  731. break;
  732. case RE_POP_GROUP:
  733. pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd = (const RECHAR *) pContext->Pop();
  734. pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart = (const RECHAR *) pContext->Pop();
  735. ip++;
  736. break;
  737. default:
  738. ATLASSERT(FALSE);
  739. break;
  740. }
  741. }
  742. #pragma warning(pop) // 4127
  743. ATLASSERT(FALSE);
  744. Error:
  745. pContext->m_Match.szEnd = sz;
  746. if (!m_bCaseSensitive)
  747. FixupMatchContext(pContext, szIn, szInput);
  748. if (ppszEnd)
  749. *ppszEnd = szIn + (sz - szInput);
  750. if (szInput != szIn)
  751. free((void *) szInput);
  752. return FALSE;
  753. }
  754. protected:
  755. REParseError m_LastError;
  756. REParseError GetLastParseError() throw()
  757. {
  758. return m_LastError;
  759. }
  760. void SetLastParseError(REParseError Error) throw()
  761. {
  762. m_LastError = Error;
  763. }
  764. // CAtlRegExp::Reset
  765. // Removes all instructions to allow reparsing into the same instance
  766. void Reset() throw()
  767. {
  768. m_Instructions.RemoveAll();
  769. m_uRequiredMem = 0;
  770. m_bCaseSensitive = TRUE;
  771. m_uNumGroups = 0;
  772. SetLastParseError(REPARSE_ERROR_OK);
  773. }
  // Opcodes of the compiled program. The notes describe what Match does
  // for each opcode; "fail" means Match pops the context stack and resumes
  // at the popped instruction index.
  enum REInstructionType {
      RE_NOP,             // no operation; continue with the next instruction
      RE_GROUP_START,     // record the input position as the start of group.nGroup
      RE_GROUP_END,       // record the input position as the end of group.nGroup
      RE_SYMBOL,          // consume one character equal to symbol.nSymbol, else fail
      RE_ANY,             // consume any one character except the terminator, else fail
      RE_RANGE,           // consume a character whose bit is set in the bit field that follows
      RE_NOTRANGE,        // consume a character whose bit is clear in the bit field that follows
      RE_RANGE_EX,        // consume a character inside one of the (low, high) pairs that follow
      RE_NOTRANGE_EX,     // consume a character outside all of the (low, high) pairs that follow
      RE_PLUS,            // no handler in Match; presumably used only while parsing - TODO confirm
      RE_NG_PLUS,         // no handler in Match; presumably used only while parsing - TODO confirm
      RE_QUESTION,        // no handler in Match; presumably used only while parsing - TODO confirm
      RE_NG_QUESTION,     // no handler in Match; presumably used only while parsing - TODO confirm
      RE_JMP,             // jump to jmp.nTarget
      RE_PUSH_CHARPOS,    // push the input position
      RE_POP_CHARPOS,     // pop the input position
      RE_CALL,            // push the next instruction index, then jump to call.nTarget
      RE_RETURN,          // pop an instruction index and continue there
      RE_STAR_BEGIN,      // no handler in Match; presumably used only while parsing - TODO confirm
      RE_NG_STAR_BEGIN,   // no handler in Match; presumably used only while parsing - TODO confirm
      RE_PUSH_MEMORY,     // push the scratch slot memory.nIndex
      RE_POP_MEMORY,      // pop into the scratch slot memory.nIndex
      RE_STORE_CHARPOS,   // save the input position into the scratch slot
      RE_STORE_STACKPOS,  // save the stack top index into the scratch slot
      RE_GET_CHARPOS,     // restore the input position from the scratch slot
      RE_GET_STACKPOS,    // restore the stack top index from the scratch slot
      RE_RET_NOMATCH,     // return if the input position equals the one in the scratch slot
      RE_PREVIOUS,        // consume text equal to what group prev.nGroup captured, else fail
      RE_FAIL,            // end the match unsuccessfully
      RE_ADVANCE,         // restart the program one character further into the input
      RE_MATCH,           // end the match successfully
      RE_PUSH_GROUP,      // push start and end of group.nGroup
      RE_POP_GROUP,       // pop end and start of group.nGroup
  };
  809. struct INSTRUCTION_SYMBOL
  810. {
  811. size_t nSymbol;
  812. };
  813. struct INSTRUCTION_JMP
  814. {
  815. size_t nTarget;
  816. };
  817. struct INSTRUCTION_GROUP
  818. {
  819. size_t nGroup;
  820. };
  821. struct INSTRUCTION_CALL
  822. {
  823. size_t nTarget;
  824. };
  825. struct INSTRUCTION_MEMORY
  826. {
  827. size_t nIndex;
  828. };
  829. struct INSTRUCTION_PREVIOUS
  830. {
  831. size_t nGroup;
  832. };
  833. struct INSTRUCTION_RANGE_EX
  834. {
  835. size_t nTarget;
  836. };
  837. struct INSTRUCTION
  838. {
  839. REInstructionType type;
  840. union
  841. {
  842. INSTRUCTION_SYMBOL symbol;
  843. INSTRUCTION_JMP jmp;
  844. INSTRUCTION_GROUP group;
  845. INSTRUCTION_CALL call;
  846. INSTRUCTION_MEMORY memory;
  847. INSTRUCTION_PREVIOUS prev;
  848. INSTRUCTION_RANGE_EX range;
  849. };
  850. };
  851. inline int InstructionsPerRangeBitField() throw()
  852. {
  853. return (256/8) / sizeof(INSTRUCTION) + (((256/8) % sizeof(INSTRUCTION)) ? 1 : 0);
  854. }
  855. CAtlArray<INSTRUCTION> m_Instructions;
  856. UINT m_uNumGroups;
  857. UINT m_uRequiredMem;
  858. BOOL m_bCaseSensitive;
  859. // class used internally to restore
  860. // parsing state when unwinding
  861. class CParseState
  862. {
  863. public:
  864. int m_nNumInstructions;
  865. UINT m_uNumGroups;
  866. UINT m_uRequiredMem;
  867. CParseState(CAtlRegExp *pRegExp) throw()
  868. {
  869. m_nNumInstructions = (int) pRegExp->m_Instructions.GetCount();
  870. m_uNumGroups = pRegExp->m_uNumGroups;
  871. m_uRequiredMem = pRegExp->m_uRequiredMem;
  872. }
  873. void Restore(CAtlRegExp *pRegExp)
  874. {
  875. pRegExp->m_Instructions.SetCount(m_nNumInstructions);
  876. pRegExp->m_uNumGroups = m_uNumGroups;
  877. pRegExp->m_uRequiredMem = m_uRequiredMem;
  878. }
  879. };
  880. int AddInstruction(REInstructionType type)
  881. {
  882. if (!m_Instructions.SetCount(m_Instructions.GetCount()+1))
  883. {
  884. SetLastParseError(REPARSE_ERROR_OUTOFMEMORY);
  885. return -1;
  886. }
  887. m_Instructions[m_Instructions.GetCount()-1].type = type;
  888. return (int) m_Instructions.GetCount()-1;
  889. }
  890. BOOL PeekToken(const RECHAR **ppszRE, int ch) throw()
  891. {
  892. if (**ppszRE != ch)
  893. return FALSE;
  894. return TRUE;
  895. }
  896. BOOL MatchToken(const RECHAR **ppszRE, int ch) throw()
  897. {
  898. if (!PeekToken(ppszRE, ch))
  899. return FALSE;
  900. *ppszRE = CharTraits::Next(*ppszRE);
  901. return TRUE;
  902. }
  903. INSTRUCTION &GetInstruction(size_t nIndex) throw()
  904. {
  905. return m_Instructions[nIndex];
  906. }
  907. // ParseArg: parse grammar rule Arg
  908. int ParseArg(const RECHAR **ppszRE, bool &bEmpty)
  909. {
  910. int nPushGroup = AddInstruction(RE_PUSH_GROUP);
  911. if (nPushGroup < 0)
  912. return -1;
  913. GetInstruction(nPushGroup).group.nGroup = m_uNumGroups;
  914. int p = AddInstruction(RE_GROUP_START);
  915. if (p < 0)
  916. return -1;
  917. GetInstruction(p).group.nGroup = m_uNumGroups++;
  918. int nCall = AddInstruction(RE_CALL);
  919. if (nCall < 0)
  920. return -1;
  921. int nPopGroup = AddInstruction(RE_POP_GROUP);
  922. if (nPopGroup < 0)
  923. return -1;
  924. GetInstruction(nPopGroup).group.nGroup = GetInstruction(nPushGroup).group.nGroup;
  925. if (AddInstruction(RE_RETURN) < 0)
  926. return -1;
  927. int nAlt = ParseRE(ppszRE, bEmpty);
  928. if (nAlt < 0)
  929. {
  930. if (GetLastParseError())
  931. return -1;
  932. if (!PeekToken(ppszRE, '}'))
  933. {
  934. SetLastParseError(REPARSE_ERROR_BRACE_EXPECTED);
  935. return -1;
  936. }
  937. // in the case of an empty group, we add a nop
  938. nAlt = AddInstruction(RE_NOP);
  939. if (nAlt < 0)
  940. return -1;
  941. }
  942. GetInstruction(nCall).call.nTarget = nAlt;
  943. if (!MatchToken(ppszRE, '}'))
  944. {
  945. SetLastParseError(REPARSE_ERROR_BRACE_EXPECTED);
  946. return -1;
  947. }
  948. int nEnd = AddInstruction(RE_GROUP_END);
  949. if (nEnd < 0)
  950. return -1;
  951. GetInstruction(nEnd).group.nGroup = GetInstruction(p).group.nGroup;
  952. return nPushGroup;
  953. }
  954. // ParseGroup: parse grammar rule Group
  955. int ParseGroup(const RECHAR **ppszRE, bool &bEmpty)
  956. {
  957. int nCall = AddInstruction(RE_CALL);
  958. if (nCall < 0)
  959. return -1;
  960. if (AddInstruction(RE_RETURN) < 0)
  961. return -1;
  962. int nAlt = ParseRE(ppszRE, bEmpty);
  963. if (nAlt < 0)
  964. {
  965. if (GetLastParseError())
  966. return -1;
  967. if (!PeekToken(ppszRE, ')'))
  968. {
  969. SetLastParseError(REPARSE_ERROR_PAREN_EXPECTED);
  970. return -1;
  971. }
  972. // in the case of an empty group, we add a nop
  973. nAlt = AddInstruction(RE_NOP);
  974. if (nAlt < 0)
  975. return -1;
  976. }
  977. GetInstruction(nCall).call.nTarget = nAlt;
  978. if (!MatchToken(ppszRE, ')'))
  979. {
  980. SetLastParseError(REPARSE_ERROR_PAREN_EXPECTED);
  981. return -1;
  982. }
  983. return nCall;
  984. }
  985. RECHAR GetEscapedChar(RECHAR ch) throw()
  986. {
  987. if (ch == 't')
  988. return '\t';
  989. return ch;
  990. }
  991. // ParseCharItem: parse grammar rule CharItem
  992. int ParseCharItem(const RECHAR **ppszRE, RECHAR *pchStartChar, RECHAR *pchEndChar) throw()
  993. {
  994. if (**ppszRE == '\\')
  995. {
  996. *ppszRE = CharTraits::Next(*ppszRE);
  997. *pchStartChar = GetEscapedChar(**ppszRE);
  998. }
  999. else
  1000. *pchStartChar = **ppszRE;
  1001. *ppszRE = CharTraits::Next(*ppszRE);
  1002. if (!MatchToken(ppszRE, '-'))
  1003. {
  1004. *pchEndChar = *pchStartChar;
  1005. return 0;
  1006. }
  1007. // check for unterminated range
  1008. if (!**ppszRE || PeekToken(ppszRE, ']'))
  1009. {
  1010. SetLastParseError(REPARSE_ERROR_BRACKET_EXPECTED);
  1011. return -1;
  1012. }
  1013. *pchEndChar = **ppszRE;
  1014. *ppszRE = CharTraits::Next(*ppszRE);
  1015. if (*pchEndChar < *pchStartChar)
  1016. {
  1017. SetLastParseError(REPARSE_ERROR_INVALID_RANGE);
  1018. return -1;
  1019. }
  1020. return 0;
  1021. }
  1022. int AddInstructions(int nNumInstructions)
  1023. {
  1024. size_t nCurr = m_Instructions.GetCount();
  1025. if (!m_Instructions.SetCount(nCurr+nNumInstructions))
  1026. {
  1027. SetLastParseError(REPARSE_ERROR_OUTOFMEMORY);
  1028. return -1;
  1029. }
  1030. return (int) nCurr;
  1031. }
  1032. // ParseCharSet: parse grammar rule CharSet
  1033. int ParseCharSet(const RECHAR **ppszRE, BOOL bNot)
  1034. {
  1035. int p = -1;
  1036. unsigned char *pBits = NULL;
  1037. if (CharTraits::UseBitFieldForRange())
  1038. {
  1039. // we use a bit field to represent the characters
  1040. // a 1 bit means match against the character
  1041. // the last 5 bits are used as an index into
  1042. // the byte array, and the first 3 bits
  1043. // are used to index into the selected byte
  1044. p = AddInstruction(bNot ? RE_NOTRANGE : RE_RANGE);
  1045. if (p < 0)
  1046. return -1;
  1047. // add the required space to hold the character
  1048. // set. We use one bit per character for ansi
  1049. if (AddInstructions(InstructionsPerRangeBitField()) < 0)
  1050. return -1;
  1051. pBits = (unsigned char *) (&m_Instructions[p+1]);
  1052. memset(pBits, 0x00, 256/8);
  1053. }
  1054. else
  1055. {
  1056. p = AddInstruction(bNot ? RE_NOTRANGE_EX : RE_RANGE_EX);
  1057. if (p < 0)
  1058. return -1;
  1059. }
  1060. RECHAR chStart;
  1061. RECHAR chEnd;
  1062. while (**ppszRE && **ppszRE != ']')
  1063. {
  1064. if (ParseCharItem(ppszRE, &chStart, &chEnd))
  1065. return -1;
  1066. if (CharTraits::UseBitFieldForRange())
  1067. {
  1068. for (int i=chStart; i<=chEnd; i++)
  1069. pBits[i >> 3] |= 1 << (i & 0x7);
  1070. }
  1071. else
  1072. {
  1073. int nStart = AddInstruction(RE_NOP);
  1074. if (nStart < 0)
  1075. return -1;
  1076. int nEnd = AddInstruction(RE_NOP);
  1077. if (nEnd < 0)
  1078. return -1;
  1079. GetInstruction(nStart).memory.nIndex = (int) chStart;
  1080. GetInstruction(nEnd).memory.nIndex = (int) chEnd;
  1081. }
  1082. }
  1083. if (!CharTraits::UseBitFieldForRange())
  1084. GetInstruction(p).range.nTarget = m_Instructions.GetCount();
  1085. return p;
  1086. }
  1087. // ParseCharClass: parse grammar rule CharClass
  1088. int ParseCharClass(const RECHAR **ppszRE, bool &bEmpty)
  1089. {
  1090. bEmpty = false;
  1091. if (MatchToken(ppszRE, ']'))
  1092. {
  1093. SetLastParseError(REPARSE_ERROR_EMPTY_RANGE);
  1094. return -1;
  1095. }
  1096. BOOL bNot = FALSE;
  1097. if (MatchToken(ppszRE, '^'))
  1098. bNot = TRUE;
  1099. if (MatchToken(ppszRE, ']'))
  1100. {
  1101. SetLastParseError(REPARSE_ERROR_EMPTY_RANGE);
  1102. return -1;
  1103. }
  1104. int p = ParseCharSet(ppszRE, bNot);
  1105. if (p < 0)
  1106. return p;
  1107. if (!MatchToken(ppszRE, ']'))
  1108. {
  1109. SetLastParseError(REPARSE_ERROR_BRACKET_EXPECTED);
  1110. return -1;
  1111. }
  1112. return p;
  1113. }
  1114. int AddMemInstruction(REInstructionType type)
  1115. {
  1116. int p = AddInstruction(type);
  1117. if (p < 0)
  1118. return p;
  1119. GetInstruction(p).memory.nIndex = m_uRequiredMem++;
  1120. return p;
  1121. }
  1122. // helper for parsing !SE
  1123. int ParseNot(const RECHAR **ppszRE, bool &bEmpty)
  1124. {
  1125. int nStoreCP = AddMemInstruction(RE_STORE_CHARPOS);
  1126. int nStoreSP = AddMemInstruction(RE_STORE_STACKPOS);
  1127. int nCall = AddInstruction(RE_CALL);
  1128. if (nCall < 0)
  1129. return -1;
  1130. int nGetCP = AddInstruction(RE_GET_CHARPOS);
  1131. if (nGetCP < 0)
  1132. return -1;
  1133. GetInstruction(nGetCP).memory.nIndex = GetInstruction(nStoreCP).memory.nIndex;
  1134. int nGetSP = AddInstruction(RE_GET_STACKPOS);
  1135. if (nGetSP < 0)
  1136. return -1;
  1137. GetInstruction(nGetSP).memory.nIndex = GetInstruction(nStoreSP).memory.nIndex;
  1138. int nJmp = AddInstruction(RE_JMP);
  1139. if (nJmp < 0)
  1140. return -1;
  1141. int nSE = ParseSE(ppszRE, bEmpty);
  1142. if (nSE < 0)
  1143. return nSE;
  1144. // patch the call
  1145. GetInstruction(nCall).call.nTarget = nSE;
  1146. int nGetCP1 = AddInstruction(RE_GET_CHARPOS);
  1147. if (nGetCP1 < 0)
  1148. return -1;
  1149. GetInstruction(nGetCP1).memory.nIndex = GetInstruction(nStoreCP).memory.nIndex;
  1150. int nGetSP1 = AddInstruction(RE_GET_STACKPOS);
  1151. if (nGetSP1 < 0)
  1152. return -1;
  1153. GetInstruction(nGetSP1).memory.nIndex = GetInstruction(nStoreSP).memory.nIndex;
  1154. int nRet = AddInstruction(RE_RETURN);
  1155. if (nRet < 0)
  1156. return -1;
  1157. GetInstruction(nJmp).jmp.nTarget = nRet+1;
  1158. return nStoreCP;
  1159. }
  1160. // ParseAbbrev: parse grammar rule Abbrev
  1161. int ParseAbbrev(const RECHAR **ppszRE, bool &bEmpty)
  1162. {
  1163. const RECHAR **szAbbrevs = CharTraits::GetAbbrevs();
  1164. while (*szAbbrevs)
  1165. {
  1166. if (**ppszRE == **szAbbrevs)
  1167. {
  1168. const RECHAR *szAbbrev = (*szAbbrevs)+1;
  1169. int p = ParseE(&szAbbrev, bEmpty);
  1170. if (p < 0)
  1171. {
  1172. SetLastParseError(REPARSE_ERROR_UNEXPECTED);
  1173. return p;
  1174. }
  1175. *ppszRE = CharTraits::Next(*ppszRE);
  1176. return p;
  1177. }
  1178. szAbbrevs++;
  1179. }
  1180. return -1;
  1181. }
  1182. // ParseSE: parse grammar rule SE (simple expression)
  1183. int ParseSE(const RECHAR **ppszRE, bool &bEmpty)
  1184. {
  1185. if (MatchToken(ppszRE, '{'))
  1186. return ParseArg(ppszRE, bEmpty);
  1187. if (MatchToken(ppszRE, '('))
  1188. return ParseGroup(ppszRE, bEmpty);
  1189. if (MatchToken(ppszRE, '['))
  1190. return ParseCharClass(ppszRE, bEmpty);
  1191. if (MatchToken(ppszRE, '\\'))
  1192. {
  1193. if (!CharTraits::Isdigit(**ppszRE))
  1194. {
  1195. // check for abbreviations
  1196. int p;
  1197. p = ParseAbbrev(ppszRE, bEmpty);
  1198. if (p >= 0)
  1199. return p;
  1200. if (GetLastParseError())
  1201. return -1;
  1202. // escaped char
  1203. p = AddInstruction(RE_SYMBOL);
  1204. if (p < 0)
  1205. return -1;
  1206. GetInstruction(p).symbol.nSymbol = (int) **ppszRE;
  1207. *ppszRE = CharTraits::Next(*ppszRE);
  1208. return p;
  1209. }
  1210. // previous match
  1211. bEmpty = false;
  1212. int nPrev = AddInstruction(RE_PREVIOUS);
  1213. if (nPrev < 0)
  1214. return -1;
  1215. UINT uValue = (UINT) CharTraits::Strtol(*ppszRE, (RECHAR **) ppszRE, 10);
  1216. if (uValue >= m_uNumGroups)
  1217. {
  1218. SetLastParseError(REPARSE_ERROR_INVALID_GROUP);
  1219. return -1;
  1220. }
  1221. GetInstruction(nPrev).prev.nGroup = (size_t) uValue;
  1222. return nPrev;
  1223. }
  1224. if (MatchToken(ppszRE, '!'))
  1225. return ParseNot(ppszRE, bEmpty);
  1226. if (**ppszRE == '}' || **ppszRE == ']' || **ppszRE == ')')
  1227. {
  1228. return -1;
  1229. }
  1230. if (**ppszRE == '\0')
  1231. {
  1232. return -1;
  1233. }
  1234. int p;
  1235. if (**ppszRE == '.')
  1236. {
  1237. p = AddInstruction(RE_ANY);
  1238. if (p < 0)
  1239. return -1;
  1240. bEmpty = false;
  1241. }
  1242. else if (**ppszRE == '$' && (*ppszRE)[1] == '\0')
  1243. {
  1244. p = AddInstruction(RE_SYMBOL);
  1245. if (p < 0)
  1246. return -1;
  1247. GetInstruction(p).symbol.nSymbol = 0;
  1248. bEmpty = false;
  1249. }
  1250. else
  1251. {
  1252. p = AddInstruction(RE_SYMBOL);
  1253. if (p < 0)
  1254. return -1;
  1255. GetInstruction(p).symbol.nSymbol = (int) **ppszRE;
  1256. bEmpty = false;
  1257. }
  1258. *ppszRE = CharTraits::Next(*ppszRE);
  1259. return p;
  1260. }
  1261. // ParseE: parse grammar rule E (expression)
  1262. int ParseE(const RECHAR **ppszRE, bool &bEmpty)
  1263. {
  1264. CParseState ParseState(this);
  1265. const RECHAR *sz = *ppszRE;
  1266. int nSE;
  1267. int nFirst = ParseSE(ppszRE, bEmpty);
  1268. if (nFirst < 0)
  1269. return nFirst;
  1270. REInstructionType type = RE_MATCH;
  1271. if (MatchToken(ppszRE, '*'))
  1272. if(MatchToken(ppszRE, '?'))
  1273. type = RE_NG_STAR_BEGIN;
  1274. else
  1275. type = RE_STAR_BEGIN;
  1276. else if (MatchToken(ppszRE, '+'))
  1277. if(MatchToken(ppszRE, '?'))
  1278. type = RE_NG_PLUS;
  1279. else
  1280. type = RE_PLUS;
  1281. else if (MatchToken(ppszRE, '?'))
  1282. if(MatchToken(ppszRE, '?'))
  1283. type = RE_NG_QUESTION;
  1284. else
  1285. type = RE_QUESTION;
  1286. if (type == RE_MATCH)
  1287. return nFirst;
  1288. if (type == RE_STAR_BEGIN || type == RE_QUESTION|| type == RE_NG_STAR_BEGIN || type == RE_NG_QUESTION)
  1289. {
  1290. ParseState.Restore(this);
  1291. }
  1292. else
  1293. {
  1294. m_uNumGroups = ParseState.m_uNumGroups;
  1295. }
  1296. *ppszRE = sz;
  1297. int nE;
  1298. if (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS || type == RE_NG_QUESTION) // Non-Greedy
  1299. {
  1300. int nCall = AddInstruction(RE_CALL);
  1301. if (nCall < 0)
  1302. return -1;
  1303. bEmpty = false;
  1304. nSE = ParseSE(ppszRE, bEmpty);
  1305. if (nSE < 0)
  1306. return nSE;
  1307. if (bEmpty && (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS))
  1308. {
  1309. SetLastParseError(REPARSE_ERROR_EMPTY_REPEATOP);
  1310. return -1;
  1311. }
  1312. bEmpty = true;
  1313. *ppszRE = CharTraits::Next(*ppszRE);
  1314. *ppszRE = CharTraits::Next(*ppszRE);
  1315. if (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS)
  1316. {
  1317. int nJmp = AddInstruction(RE_JMP);
  1318. if (nJmp < 0)
  1319. return -1;
  1320. GetInstruction(nCall).call.nTarget = nJmp+1;
  1321. GetInstruction(nJmp).jmp.nTarget = nCall;
  1322. }
  1323. else
  1324. GetInstruction(nCall).call.nTarget = nSE+1;
  1325. if (type == RE_NG_PLUS)
  1326. nE = nFirst;
  1327. else
  1328. nE = nCall;
  1329. }
  1330. else // Greedy
  1331. {
  1332. int nPushMem = AddInstruction(RE_PUSH_MEMORY);
  1333. if (nPushMem < 0)
  1334. return -1;
  1335. int nStore = AddInstruction(RE_STORE_CHARPOS);
  1336. if (nStore < 0)
  1337. return -1;
  1338. if (AddInstruction(RE_PUSH_CHARPOS) < 0)
  1339. return -1;
  1340. int nCall = AddInstruction(RE_CALL);
  1341. if (nCall < 0)
  1342. return -1;
  1343. if (AddInstruction(RE_POP_CHARPOS) < 0)
  1344. return -1;
  1345. int nPopMem = AddInstruction(RE_POP_MEMORY);
  1346. if (nPopMem < 0)
  1347. return -1;
  1348. int nJmp = AddInstruction(RE_JMP);
  1349. if (nJmp < 0)
  1350. return -1;
  1351. GetInstruction(nPushMem).memory.nIndex = m_uRequiredMem++;
  1352. GetInstruction(nStore).memory.nIndex = GetInstruction(nPushMem).memory.nIndex;
  1353. GetInstruction(nCall).call.nTarget = nJmp+1;
  1354. GetInstruction(nPopMem).memory.nIndex = GetInstruction(nPushMem).memory.nIndex;
  1355. bEmpty = false;
  1356. nSE = ParseSE(ppszRE, bEmpty);
  1357. if (nSE < 0)
  1358. return nSE;
  1359. if (bEmpty && (type == RE_STAR_BEGIN || type == RE_PLUS))
  1360. {
  1361. SetLastParseError(REPARSE_ERROR_EMPTY_REPEATOP);
  1362. return -1;
  1363. }
  1364. if (type != RE_PLUS && type != RE_NG_PLUS)
  1365. bEmpty = true;
  1366. *ppszRE = CharTraits::Next(*ppszRE);
  1367. int nRetNoMatch = AddInstruction(RE_RET_NOMATCH);
  1368. if (nRetNoMatch < 0)
  1369. return -1;
  1370. int nStore1 = AddInstruction(RE_STORE_CHARPOS);
  1371. if (nStore1 < 0)
  1372. return -1;
  1373. GetInstruction(nRetNoMatch).memory.nIndex = GetInstruction(nPushMem).memory.nIndex;
  1374. GetInstruction(nStore1).memory.nIndex = GetInstruction(nPushMem).memory.nIndex;
  1375. if (type != RE_QUESTION)
  1376. {
  1377. int nJmp1 = AddInstruction(RE_JMP);
  1378. if (nJmp1 < 0)
  1379. return -1;
  1380. GetInstruction(nJmp1).jmp.nTarget = nPushMem;
  1381. }
  1382. GetInstruction(nJmp).jmp.nTarget = m_Instructions.GetCount();
  1383. if (type == RE_PLUS)
  1384. nE = nFirst;
  1385. else
  1386. nE = nPushMem;
  1387. }
  1388. return nE;
  1389. }
  1390. // ParseAltE: parse grammar rule AltE
  1391. int ParseAltE(const RECHAR **ppszRE, bool &bEmpty)
  1392. {
  1393. const RECHAR *sz = *ppszRE;
  1394. CParseState ParseState(this);
  1395. int nPush = AddInstruction(RE_PUSH_CHARPOS);
  1396. if (nPush < 0)
  1397. return -1;
  1398. int nCall = AddInstruction(RE_CALL);
  1399. if (nCall < 0)
  1400. return -1;
  1401. GetInstruction(nCall).call.nTarget = nPush+4;
  1402. if (AddInstruction(RE_POP_CHARPOS) < 0)
  1403. return -1;
  1404. int nJmpNext = AddInstruction(RE_JMP);
  1405. if (nJmpNext < 0)
  1406. return -1;
  1407. int nE = ParseE(ppszRE, bEmpty);
  1408. if (nE < 0)
  1409. {
  1410. if (GetLastParseError())
  1411. return -1;
  1412. ParseState.Restore(this);
  1413. return nE;
  1414. }
  1415. int nJmpEnd = AddInstruction(RE_JMP);
  1416. if (nJmpEnd < 0)
  1417. return -1;
  1418. GetInstruction(nJmpNext).jmp.nTarget = nJmpEnd+1;
  1419. if (!MatchToken(ppszRE, '|'))
  1420. {
  1421. ParseState.Restore(this);
  1422. *ppszRE = sz;
  1423. return ParseE(ppszRE, bEmpty);
  1424. }
  1425. bool bEmptyAltE;
  1426. int nAltE = ParseAltE(ppszRE, bEmptyAltE);
  1427. GetInstruction(nJmpEnd).jmp.nTarget = m_Instructions.GetCount();
  1428. GetInstruction(nJmpNext).jmp.nTarget = nAltE;
  1429. if (nAltE < 0)
  1430. {
  1431. if (GetLastParseError())
  1432. return -1;
  1433. ParseState.Restore(this);
  1434. return nAltE;
  1435. }
  1436. bEmpty = bEmpty | bEmptyAltE;
  1437. return nPush;
  1438. }
  1439. // ParseRE: parse grammar rule RE (regular expression)
  1440. int ParseRE(const RECHAR **ppszRE, bool &bEmpty)
  1441. {
  1442. if (**ppszRE == '\0')
  1443. return -1;
  1444. int p = ParseAltE(ppszRE, bEmpty);
  1445. if (p < 0)
  1446. return p;
  1447. bool bEmptyRE = true;
  1448. ParseRE(ppszRE, bEmptyRE);
  1449. if (GetLastParseError())
  1450. return -1;
  1451. bEmpty = bEmpty && bEmptyRE;
  1452. return p;
  1453. }
  1454. //pointers to the matched string and matched groups, currently point into an internal allocated
  1455. //buffer that hold a copy of the input string.
  1456. //This function fix these pointers to point into the original, user supplied buffer (first param to Match method).
  1457. //Example: If a ptr (szStart) currently point to <internal buffer>+3, it is fixed to <user supplied buffer>+3
  1458. void FixupMatchContext(CAtlREMatchContext<CharTraits> *pContext, const RECHAR *szOrig, const RECHAR *szNew)
  1459. {
  1460. ATLENSURE(pContext);
  1461. ATLASSERT(szOrig);
  1462. ATLASSERT(szNew);
  1463. pContext->m_Match.szStart = szOrig + (pContext->m_Match.szStart - szNew);
  1464. pContext->m_Match.szEnd = szOrig + (pContext->m_Match.szEnd - szNew);
  1465. for (UINT i=0; i<pContext->m_uNumGroups; i++)
  1466. {
  1467. if (pContext->m_Matches[i].szStart==NULL || pContext->m_Matches[i].szEnd==NULL)
  1468. {
  1469. continue; //Do not fix unmatched groups.
  1470. }
  1471. pContext->m_Matches[i].szStart = szOrig + (pContext->m_Matches[i].szStart - szNew);
  1472. pContext->m_Matches[i].szEnd = szOrig + (pContext->m_Matches[i].szEnd - szNew);
  1473. }
  1474. }
  1475. // implementation
  1476. // helpers for dumping and debugging the rx engine
  1477. public:
  1478. #ifdef ATL_REGEXP_DUMP
  1479. size_t DumpInstruction(size_t ip)
  1480. {
  1481. printf("%08x ", ip);
  1482. switch (GetInstruction(ip).type)
  1483. {
  1484. case RE_NOP:
  1485. printf("NOP\n");
  1486. ip++;
  1487. break;
  1488. case RE_SYMBOL:
  1489. AtlprintfT<RECHAR>(CAToREChar<RECHAR>("Symbol %c\n"),GetInstruction(ip).symbol.nSymbol);
  1490. ip++;
  1491. break;
  1492. case RE_ANY:
  1493. printf("Any\n");
  1494. ip++;
  1495. break;
  1496. case RE_RANGE:
  1497. printf("Range\n");
  1498. ip++;
  1499. ip += InstructionsPerRangeBitField();
  1500. break;
  1501. case RE_NOTRANGE:
  1502. printf("NOT Range\n");
  1503. ip++;
  1504. ip += InstructionsPerRangeBitField();
  1505. break;
  1506. case RE_RANGE_EX:
  1507. printf("RangeEx %08x\n", GetInstruction(ip).range.nTarget);
  1508. ip++;
  1509. break;
  1510. case RE_NOTRANGE_EX:
  1511. printf("NotRangeEx %08x\n", GetInstruction(ip).range.nTarget);
  1512. ip++;
  1513. break;
  1514. case RE_GROUP_START:
  1515. printf("Start group %d\n", GetInstruction(ip).group.nGroup);
  1516. ip++;
  1517. break;
  1518. case RE_GROUP_END:
  1519. printf("Group end %d\n", GetInstruction(ip).group.nGroup);
  1520. ip++;
  1521. break;
  1522. case RE_PUSH_CHARPOS:
  1523. printf("Push char pos\n");
  1524. ip++;
  1525. break;
  1526. case RE_POP_CHARPOS:
  1527. printf("Pop char pos\n");
  1528. ip++;
  1529. break;
  1530. case RE_STORE_CHARPOS:
  1531. printf("Store char pos %d\n", GetInstruction(ip).memory.nIndex);
  1532. ip++;
  1533. break;
  1534. case RE_GET_CHARPOS:
  1535. printf("Get char pos %d\n", GetInstruction(ip).memory.nIndex);
  1536. ip++;
  1537. break;
  1538. case RE_STORE_STACKPOS:
  1539. printf("Store stack pos %d\n", GetInstruction(ip).memory.nIndex);
  1540. ip++;
  1541. break;
  1542. case RE_GET_STACKPOS:
  1543. printf("Get stack pos %d\n", GetInstruction(ip).memory.nIndex);
  1544. ip++;
  1545. break;
  1546. case RE_CALL:
  1547. printf("Call %08x\n", GetInstruction(ip).call.nTarget);
  1548. ip++;
  1549. break;
  1550. case RE_JMP:
  1551. printf("Jump %08x\n", GetInstruction(ip).jmp.nTarget);
  1552. ip++;
  1553. break;
  1554. case RE_RETURN:
  1555. printf("return\n");
  1556. ip++;
  1557. break;
  1558. case RE_PUSH_MEMORY:
  1559. printf("Push memory %08x\n", GetInstruction(ip).memory.nIndex);
  1560. ip++;
  1561. break;
  1562. case RE_POP_MEMORY:
  1563. printf("Pop memory %08x\n", GetInstruction(ip).memory.nIndex);
  1564. ip++;
  1565. break;
  1566. case RE_RET_NOMATCH:
  1567. printf("Return no match %08x\n", GetInstruction(ip).memory.nIndex);
  1568. ip++;
  1569. break;
  1570. case RE_MATCH:
  1571. printf("END\n");
  1572. ip++;
  1573. break;
  1574. case RE_ADVANCE:
  1575. printf("ADVANCE\n");
  1576. ip++;
  1577. break;
  1578. case RE_FAIL:
  1579. printf("FAIL\n");
  1580. ip++;
  1581. break;
  1582. case RE_PREVIOUS:
  1583. printf("Prev %d\n", GetInstruction(ip).prev.nGroup);
  1584. ip++;
  1585. break;
  1586. case RE_PUSH_GROUP:
  1587. printf("Push group %d\n", GetInstruction(ip).group.nGroup);
  1588. ip++;
  1589. break;
  1590. case RE_POP_GROUP:
  1591. printf("Pop group %d\n", GetInstruction(ip).group.nGroup);
  1592. ip++;
  1593. break;
  1594. default:
  1595. printf("????\n");
  1596. ip++;
  1597. break;
  1598. }
  1599. return ip;
  1600. }
  1601. void Dump(size_t ipCurrent = 0)
  1602. {
  1603. size_t ip = 0;
  1604. while (ip < m_Instructions.GetCount())
  1605. {
  1606. if (ip == ipCurrent)
  1607. printf("->");
  1608. ip = DumpInstruction(ip);
  1609. }
  1610. }
  1611. #endif
  1612. #ifdef ATLRX_DEBUG
  1613. void cls( HANDLE hConsole )
  1614. {
  1615. COORD coordScreen = { 0, 0 }; /* here's where we'll home the
  1616. cursor */
  1617. BOOL bSuccess;
  1618. DWORD cCharsWritten;
  1619. CONSOLE_SCREEN_BUFFER_INFO csbi; /* to get buffer info */
  1620. DWORD dwConSize; /* number of character cells in
  1621. the current buffer */
  1622. /* get the number of character cells in the current buffer */
  1623. bSuccess = GetConsoleScreenBufferInfo( hConsole, &csbi );
  1624. dwConSize = csbi.dwSize.X * csbi.dwSize.Y;
  1625. /* fill the entire screen with blanks */
  1626. bSuccess = FillConsoleOutputCharacter( hConsole, (TCHAR) ' ',
  1627. dwConSize, coordScreen, &cCharsWritten );
  1628. /* get the current text attribute */
  1629. bSuccess = GetConsoleScreenBufferInfo( hConsole, &csbi );
  1630. /* now set the buffer's attributes accordingly */
  1631. bSuccess = FillConsoleOutputAttribute( hConsole, csbi.wAttributes,
  1632. dwConSize, coordScreen, &cCharsWritten );
  1633. /* put the cursor at (0, 0) */
  1634. bSuccess = SetConsoleCursorPosition( hConsole, coordScreen );
  1635. return;
  1636. }
  1637. void DumpStack(CAtlREMatchContext<CharTraits> *pContext)
  1638. {
  1639. for (size_t i=pContext->m_nTos; i>0; i--)
  1640. {
  1641. if (pContext->m_stack[i] < (void *) m_Instructions.GetCount())
  1642. printf("0x%p\n", pContext->m_stack[i]);
  1643. else
  1644. {
  1645. // assume a pointer into the input
  1646. AtlprintfT<RECHAR>(CAToREChar<RECHAR>("%s\n"), pContext->m_stack[i]);
  1647. }
  1648. }
  1649. }
  1650. void DumpMemory(CAtlREMatchContext<CharTraits> *pContext)
  1651. {
  1652. for (UINT i=0; i<m_uRequiredMem; i++)
  1653. {
  1654. AtlprintfT<RECHAR>(CAToREChar<RECHAR>("%d: %s\n"), i, pContext->m_Mem.m_p[i]);
  1655. }
  1656. }
  1657. virtual void OnDebugEvent(size_t ip, const RECHAR *szIn, const RECHAR *sz, CAtlREMatchContext<CharTraits> *pContext)
  1658. {
  1659. cls(GetStdHandle(STD_OUTPUT_HANDLE));
  1660. printf("----------Code---------\n");
  1661. Dump(ip);
  1662. printf("----------Input---------\n");
  1663. AtlprintfT<RECHAR>(CAToREChar<RECHAR>("%s\n"), szIn);
  1664. for (int s=0; szIn+s < sz; s++)
  1665. {
  1666. printf(" ");
  1667. }
  1668. printf("^\n");
  1669. printf("----------Memory---------\n");
  1670. DumpMemory(pContext);
  1671. printf("----------Stack---------\n");
  1672. DumpStack(pContext);
  1673. getchar();
  1674. }
  1675. #endif
  1676. };
  1677. } // namespace ATL
  1678. #pragma pack(pop)
  1679. #endif // __ATLRX_H__