xref: /haiku/src/kits/shared/RegExp.cpp (revision 71452e98334eaac603bf542d159e24788a46bebb)
1 /*
2  * Copyright 2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2013, Rene Gollent, rene@gollent.com.
4  * Distributed under the terms of the MIT License.
5  */
6 
7 
8 #include <RegExp.h>
9 
10 #include <new>
11 
12 #include <regex.h>
13 
14 #include <String.h>
15 
16 #include <Referenceable.h>
17 
18 
19 // #pragma mark - RegExp::Data
20 
21 
22 struct RegExp::Data : public BReferenceable {
23 	Data(const char* pattern, PatternType patternType, bool caseSensitive)
24 		:
25 		BReferenceable()
26 	{
27 		// convert the shell pattern to a regular expression
28 		BString patternString;
29 		if (patternType == PATTERN_TYPE_WILDCARD) {
30 			while (*pattern != '\0') {
31 				char c = *pattern++;
32 				switch (c) {
33 					case '?':
34 						patternString += '.';
35 						continue;
36 					case '*':
37 						patternString += ".*";
38 						continue;
39 					case '[':
40 					{
41 						// find the matching ']' first
42 						const char* end = pattern;
43 						while (*end != ']') {
44 							if (*end++ == '\0') {
45 								fError = REG_EBRACK;
46 								return;
47 							}
48 						}
49 
50 						if (pattern == end) {
51 							// Empty bracket expression. It will never match
52 							// anything. Strictly speaking this is not
53 							// considered an error, but we handle it like one.
54 							fError = REG_EBRACK;
55 							return;
56 						}
57 
58 						patternString += '[';
59 
60 						// We need to avoid "[." ... ".]", "[=" ... "=]", and
61 						// "[:" ... ":]" sequences, since those have special
62 						// meaning in regular expressions. If we encounter
63 						// a '[' followed by either of '.', '=', or ':', we
64 						// replace the '[' by "[.[.]".
65 						while (pattern < end) {
66 							c = *pattern++;
67 							if (c == '[' && pattern < end) {
68 								switch (*pattern) {
69 									case '.':
70 									case '=':
71 									case ':':
72 										patternString += "[.[.]";
73 										continue;
74 								}
75 							}
76 							patternString += c;
77 						}
78 
79 						pattern++;
80 						patternString += ']';
81 						break;
82 					}
83 
84 					case '\\':
85 					{
86 						// Quotes the next character. Works the same way for
87 						// regular expressions.
88 						if (*pattern == '\0') {
89 							fError = REG_EESCAPE;
90 							return;
91 						}
92 
93 						patternString += '\\';
94 						patternString += *pattern++;
95 						break;
96 					}
97 
98 					case '^':
99 					case '.':
100 					case '$':
101 					case '(':
102 					case ')':
103 					case '|':
104 					case '+':
105 					case '{':
106 						// need to be quoted
107 						patternString += '\\';
108 						// fall through
109 					default:
110 						patternString += c;
111 						break;
112 				}
113 			}
114 
115 			pattern = patternString.String();
116 		}
117 
118 		int flags = REG_EXTENDED;
119 		if (!caseSensitive)
120 			flags |= REG_ICASE;
121 
122 		fError = regcomp(&fCompiledExpression, pattern, flags);
123 	}
124 
125 	~Data()
126 	{
127 		if (fError == 0)
128 			regfree(&fCompiledExpression);
129 	}
130 
131 	bool IsValid() const
132 	{
133 		return fError == 0;
134 	}
135 
136 	const regex_t* CompiledExpression() const
137 	{
138 		return &fCompiledExpression;
139 	}
140 
141 private:
142 	int		fError;
143 	regex_t	fCompiledExpression;
144 };
145 
146 
147 // #pragma mark - RegExp::MatchResultData
148 
149 
150 struct RegExp::MatchResultData : public BReferenceable {
151 	MatchResultData(const regex_t* compiledExpression, const char* string)
152 		:
153 		BReferenceable(),
154 		fMatchCount(0),
155 		fMatches(NULL)
156 	{
157 		// fMatchCount is always set to the number of matching groups in the
158 		// expression (or 0 if an error occured). Some of the "matches" in
159 		// the array may still point to the (-1,-1) range if they don't
160 		// actually match anything.
161 		fMatchCount = compiledExpression->re_nsub + 1;
162 		fMatches = new regmatch_t[fMatchCount];
163 		if (regexec(compiledExpression, string, fMatchCount, fMatches, 0)
164 				!= 0) {
165 			delete[] fMatches;
166 			fMatches = NULL;
167 			fMatchCount = 0;
168 		}
169 	}
170 
171 	~MatchResultData()
172 	{
173 		delete[] fMatches;
174 	}
175 
176 	size_t MatchCount() const
177 	{
178 		return fMatchCount;
179 	}
180 
181 	const regmatch_t* Matches() const
182 	{
183 		return fMatches;
184 	}
185 
186 private:
187 	size_t		fMatchCount;
188 	regmatch_t*	fMatches;
189 };
190 
191 
192 // #pragma mark - RegExp
193 
194 
195 RegExp::RegExp()
196 	:
197 	fData(NULL)
198 {
199 }
200 
201 
202 RegExp::RegExp(const char* pattern, PatternType patternType,
203 	bool caseSensitive)
204 	:
205 	fData(NULL)
206 {
207 	SetPattern(pattern, patternType, caseSensitive);
208 }
209 
210 
211 RegExp::RegExp(const RegExp& other)
212 	:
213 	fData(other.fData)
214 {
215 	if (fData != NULL)
216 		fData->AcquireReference();
217 }
218 
219 
220 RegExp::~RegExp()
221 {
222 	if (fData != NULL)
223 		fData->ReleaseReference();
224 }
225 
226 
227 bool
228 RegExp::SetPattern(const char* pattern, PatternType patternType,
229 	bool caseSensitive)
230 {
231 	if (fData != NULL) {
232 		fData->ReleaseReference();
233 		fData = NULL;
234 	}
235 
236 	Data* newData = new(std::nothrow) Data(pattern, patternType, caseSensitive);
237 	if (newData == NULL)
238 		return false;
239 
240 	BReference<Data> dataReference(newData, true);
241 	if (!newData->IsValid())
242 		return false;
243 
244 	fData = dataReference.Detach();
245 	return true;
246 }
247 
248 
249 RegExp::MatchResult
250 RegExp::Match(const char* string) const
251 {
252 	if (!IsValid())
253 		return MatchResult();
254 
255 	return MatchResult(
256 		new(std::nothrow) MatchResultData(fData->CompiledExpression(),
257 			string));
258 }
259 
260 
261 RegExp&
262 RegExp::operator=(const RegExp& other)
263 {
264 	if (fData != NULL)
265 		fData->ReleaseReference();
266 
267 	fData = other.fData;
268 
269 	if (fData != NULL)
270 		fData->AcquireReference();
271 
272 	return *this;
273 }
274 
275 
276 // #pragma mark - RegExp::MatchResult
277 
278 
279 RegExp::MatchResult::MatchResult()
280 	:
281 	fData(NULL)
282 {
283 }
284 
285 
286 RegExp::MatchResult::MatchResult(MatchResultData* data)
287 	:
288 	fData(data)
289 {
290 }
291 
292 
293 RegExp::MatchResult::MatchResult(const MatchResult& other)
294 	:
295 	fData(other.fData)
296 {
297 	if (fData != NULL)
298 		fData->AcquireReference();
299 }
300 
301 
302 RegExp::MatchResult::~MatchResult()
303 {
304 	if (fData != NULL)
305 		fData->ReleaseReference();
306 }
307 
308 
309 bool
310 RegExp::MatchResult::HasMatched() const
311 {
312 	return fData != NULL && fData->MatchCount() > 0;
313 }
314 
315 
316 size_t
317 RegExp::MatchResult::StartOffset() const
318 {
319 	return fData != NULL && fData->MatchCount() > 0
320 		? fData->Matches()[0].rm_so : 0;
321 }
322 
323 
324 size_t
325 RegExp::MatchResult::EndOffset() const
326 {
327 	return fData != NULL && fData->MatchCount() > 0
328 		? fData->Matches()[0].rm_eo : 0;
329 }
330 
331 
332 size_t
333 RegExp::MatchResult::GroupCount() const
334 {
335 	if (fData == NULL)
336 		return 0;
337 
338 	size_t matchCount = fData->MatchCount();
339 	return matchCount > 0 ? matchCount - 1 : 0;
340 }
341 
342 
343 size_t
344 RegExp::MatchResult::GroupStartOffsetAt(size_t index) const
345 {
346 	return fData != NULL && fData->MatchCount() > index + 1
347 		? fData->Matches()[index + 1].rm_so : 0;
348 }
349 
350 
351 size_t
352 RegExp::MatchResult::GroupEndOffsetAt(size_t index) const
353 {
354 	return fData != NULL && fData->MatchCount() > index + 1
355 		? fData->Matches()[index + 1].rm_eo : 0;
356 }
357 
358 
359 RegExp::MatchResult&
360 RegExp::MatchResult::operator=(const MatchResult& other)
361 {
362 	if (fData != NULL)
363 		fData->ReleaseReference();
364 
365 	fData = other.fData;
366 
367 	if (fData != NULL)
368 		fData->AcquireReference();
369 
370 	return *this;
371 }
372