//==================================================================================================
// Copyright (C) 2010 Brian Tietz sdbtietz at yahoo dot com
//
// This program is free software; you can redistribute it and/or modify it under the terms of the
// GNU General Public License as published by the Free Software Foundation, version 2.0 of the
// License.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
// even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program; if
// not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
// 02110-1301, USA.
//
// For commercial software, the copyright holder (Brian Tietz, email sdbtietz at yahoo dot com)
// reserves the right and is willing to waive the proprietary source code disclosure aspects of that
// license as applied to the UT library in exchange for either substantial contributions to the
// development of the UT library or other forms of compensation. Any such waiver must be
// established in writing between the copyright holder and the commercial entity obtaining such a
// waiver.
00022 //================================================================================================== 00023 00024 00025 #ifndef _UT_REG_EXP_SUPPORT_H_ 00026 #define _UT_REG_EXP_SUPPORT_H_ 00027 00028 // \cond DOXYGEN_DOCUMENT_NEVER 00029 00030 00031 //================================================================================================== 00032 //=== Project headers 00033 //================================================================================================== 00034 #include "UT.h" 00035 00036 00037 //================================================================================================== 00038 //=== Constants 00039 //================================================================================================== 00040 enum regexp_normal_seq_context_t 00041 { 00042 eRNSC_called_from_root, 00043 eRNSC_called_from_aggregate 00044 }; 00045 00046 00047 //================================================================================================== 00048 class RegExpEvaluator_t 00049 //================================================================================================== 00050 { 00051 //---------------------------------------------------------------------------------------------- 00052 public: 00053 //---------------------------------------------------------------------------------------------- 00054 Status_t Evaluate( const utf8* expression, 00055 out int* expression_fail_point_chars, 00056 const utf8* match, 00057 out int* match_fail_pos_chars, 00058 bool force_start_match, 00059 out String_t* pre, 00060 bool force_end_match, 00061 out String_t* post, 00062 out String_t** substring_array, 00063 int substring_count ); 00064 00065 //---------------------------------------------------------------------------------------------- 00066 private: 00067 //---------------------------------------------------------------------------------------------- 00068 Status_t ProcessNormalSequence(regexp_normal_seq_context_t context); 
00069 Status_t ProcessEscapeSequence(); 00070 Status_t ExtractCountControl( out int* min_count, out int* max_count ); 00071 Status_t ExtractType(); 00072 Status_t ProcessAggregate(); 00073 00074 //---------------------------------------------------------------------------------------------- 00075 private: 00076 //---------------------------------------------------------------------------------------------- 00077 const utf8* m_expression; 00078 const utf8* m_match_N; // Can be NULL 00079 }; 00080 00081 00082 enum regexp_type_t 00083 { 00084 eRET_whitespace = 's', // /s (space, tab, carriage return, linefeed) 00085 eRET_numeric_digit = 'd', // /d (0-9) 00086 eRET_lowercase_hex = 'h', // /h (0-9,a-f) 00087 eRET_uppercase_hex = 'H', // /H (0-9,A-F) 00088 eRET_anycase_hex = 'x', // /ih (0-9,a-f,A-f) 00089 eRET_lowercase_letter = 'c', // /c (a-z) 00090 eRET_uppercase_letter = 'C', // /C (A-Z) 00091 eRET_anycase_letter = 'i', // /ic (a-z,A-Z) 00092 eRET_letter_or_above_ascii = 'U', // /U (a-z,A-Z,non-ASCII UTF8) 00093 eRET_letter_number_or_above_ascii = 'n', // /nU (0-9,a-z,A-Z,non-ASCII UTF8) 00094 eRET_token_character = 't', // /t (a-z,A-Z,0-9,_) 00095 eRET_token_start_character = 'T', // /T (a-z,A-Z,_) 00096 eRET_aggregation = '[' // [,] 00097 }; 00098 00099 00100 // \endcond 00101 00102 #endif // _UT_REG_EXP_SUPPORT_H_