xref: /haiku/src/kits/shared/RegExp.cpp (revision 579f1dbca962a2a03df54f69fdc6e9423f91f20e)
1 /*
2  * Copyright 2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2013, Rene Gollent, rene@gollent.com.
4  * Distributed under the terms of the MIT License.
5  */
6 
7 
8 #include <RegExp.h>
9 
10 #include <new>
11 
12 #include <regex.h>
13 
14 #include <String.h>
15 
16 #include <Referenceable.h>
17 
18 
19 // #pragma mark - RegExp::Data
20 
21 
22 struct RegExp::Data : public BReferenceable {
23 	Data(const char* pattern, PatternType patternType, bool caseSensitive)
24 		:
25 		BReferenceable()
26 	{
27 		// convert the shell pattern to a regular expression
28 		BString patternString;
29 		if (patternType == PATTERN_TYPE_WILDCARD) {
30 			while (*pattern != '\0') {
31 				char c = *pattern++;
32 				switch (c) {
33 					case '?':
34 						patternString += '.';
35 						continue;
36 					case '*':
37 						patternString += ".*";
38 						continue;
39 					case '[':
40 					{
41 						// find the matching ']' first
42 						const char* end = pattern;
43 						while (*end != ']') {
44 							if (*end++ == '\0') {
45 								fError = REG_EBRACK;
46 								return;
47 							}
48 						}
49 
50 						if (pattern == end) {
51 							// Empty bracket expression. It will never match
52 							// anything. Strictly speaking this is not
53 							// considered an error, but we handle it like one.
54 							fError = REG_EBRACK;
55 							return;
56 						}
57 
58 						patternString += '[';
59 
60 						// We need to avoid "[." ... ".]", "[=" ... "=]", and
61 						// "[:" ... ":]" sequences, since those have special
62 						// meaning in regular expressions. If we encounter
63 						// a '[' followed by either of '.', '=', or ':', we
64 						// replace the '[' by "[.[.]".
65 						while (pattern < end) {
66 							c = *pattern++;
67 							if (c == '[' && pattern < end) {
68 								switch (*pattern) {
69 									case '.':
70 									case '=':
71 									case ':':
72 										patternString += "[.[.]";
73 										continue;
74 								}
75 							}
76 							patternString += c;
77 						}
78 
79 						pattern++;
80 						patternString += ']';
81 						break;
82 					}
83 
84 					case '\\':
85 					{
86 						// Quotes the next character. Works the same way for
87 						// regular expressions.
88 						if (*pattern == '\0') {
89 							fError = REG_EESCAPE;
90 							return;
91 						}
92 
93 						patternString += '\\';
94 						patternString += *pattern++;
95 						break;
96 					}
97 
98 					case '^':
99 					case '.':
100 					case '$':
101 					case '(':
102 					case ')':
103 					case '|':
104 					case '+':
105 					case '{':
106 						// need to be quoted
107 						patternString += '\\';
108 						// fall through
109 					default:
110 						patternString += c;
111 						break;
112 				}
113 			}
114 
115 			pattern = patternString.String();
116 		}
117 
118 		int flags = REG_EXTENDED;
119 		if (!caseSensitive)
120 			flags |= REG_ICASE;
121 
122 		fError = regcomp(&fCompiledExpression, pattern, flags);
123 	}
124 
125 	~Data()
126 	{
127 		if (fError == 0)
128 			regfree(&fCompiledExpression);
129 	}
130 
131 	bool IsValid() const
132 	{
133 		return fError == 0;
134 	}
135 
136 	const regex_t* CompiledExpression() const
137 	{
138 		return &fCompiledExpression;
139 	}
140 
141 private:
142 	int		fError;
143 	regex_t	fCompiledExpression;
144 };
145 
146 
147 // #pragma mark - RegExp::MatchResultData
148 
149 
150 struct RegExp::MatchResultData : public BReferenceable {
151 	MatchResultData(const regex_t* compiledExpression, const char* string)
152 		:
153 		BReferenceable(),
154 		fMatchCount(0),
155 		fMatches(NULL)
156 	{
157 		// Do the matching: Since we need to provide a buffer for the matches
158 		// for regexec() to fill in, but don't know the number of matches
159 		// beforehand, we need to guess and retry with a larger buffer, if it
160 		// wasn't large enough.
161 		size_t maxMatchCount = 32;
162 		for (;;) {
163 			fMatches = new regmatch_t[maxMatchCount];
164 			if (regexec(compiledExpression, string, maxMatchCount, fMatches, 0)
165 					!= 0) {
166 				delete[] fMatches;
167 				fMatches = NULL;
168 				fMatchCount = 0;
169 				break;
170 			}
171 
172 			if (fMatches[maxMatchCount - 1].rm_so == -1) {
173 				// determine the match count
174 				size_t lower = 0;
175 				size_t upper = maxMatchCount;
176 				while (lower < upper) {
177 					size_t mid = (lower + upper) / 2;
178 					if (fMatches[mid].rm_so == -1)
179 						upper = mid;
180 					else
181 						lower = mid + 1;
182 				}
183 				fMatchCount = lower;
184 				break;
185 			}
186 
187 			// buffer too small -- try again with larger buffer
188 			delete[] fMatches;
189 			fMatches = NULL;
190 			maxMatchCount *= 2;
191 		}
192 	}
193 
194 	~MatchResultData()
195 	{
196 		delete[] fMatches;
197 	}
198 
199 	size_t MatchCount() const
200 	{
201 		return fMatchCount;
202 	}
203 
204 	const regmatch_t* Matches() const
205 	{
206 		return fMatches;
207 	}
208 
209 private:
210 	size_t		fMatchCount;
211 	regmatch_t*	fMatches;
212 };
213 
214 
215 // #pragma mark - RegExp
216 
217 
218 RegExp::RegExp()
219 	:
220 	fData(NULL)
221 {
222 }
223 
224 
225 RegExp::RegExp(const char* pattern, PatternType patternType,
226 	bool caseSensitive)
227 	:
228 	fData(NULL)
229 {
230 	SetPattern(pattern, patternType, caseSensitive);
231 }
232 
233 
234 RegExp::RegExp(const RegExp& other)
235 	:
236 	fData(other.fData)
237 {
238 	if (fData != NULL)
239 		fData->AcquireReference();
240 }
241 
242 
243 RegExp::~RegExp()
244 {
245 	if (fData != NULL)
246 		fData->ReleaseReference();
247 }
248 
249 
250 bool
251 RegExp::SetPattern(const char* pattern, PatternType patternType,
252 	bool caseSensitive)
253 {
254 	if (fData != NULL) {
255 		fData->ReleaseReference();
256 		fData = NULL;
257 	}
258 
259 	Data* newData = new(std::nothrow) Data(pattern, patternType, caseSensitive);
260 	if (newData == NULL)
261 		return false;
262 
263 	BReference<Data> dataReference(newData, true);
264 	if (!newData->IsValid())
265 		return false;
266 
267 	fData = dataReference.Detach();
268 	return true;
269 }
270 
271 
272 RegExp::MatchResult
273 RegExp::Match(const char* string) const
274 {
275 	if (!IsValid())
276 		return MatchResult();
277 
278 	return MatchResult(
279 		new(std::nothrow) MatchResultData(fData->CompiledExpression(),
280 			string));
281 }
282 
283 
284 RegExp&
285 RegExp::operator=(const RegExp& other)
286 {
287 	if (fData != NULL)
288 		fData->ReleaseReference();
289 
290 	fData = other.fData;
291 
292 	if (fData != NULL)
293 		fData->AcquireReference();
294 
295 	return *this;
296 }
297 
298 
299 // #pragma mark - RegExp::MatchResult
300 
301 
302 RegExp::MatchResult::MatchResult()
303 	:
304 	fData(NULL)
305 {
306 }
307 
308 
309 RegExp::MatchResult::MatchResult(MatchResultData* data)
310 	:
311 	fData(data)
312 {
313 }
314 
315 
316 RegExp::MatchResult::MatchResult(const MatchResult& other)
317 	:
318 	fData(other.fData)
319 {
320 	if (fData != NULL)
321 		fData->AcquireReference();
322 }
323 
324 
325 RegExp::MatchResult::~MatchResult()
326 {
327 	if (fData != NULL)
328 		fData->ReleaseReference();
329 }
330 
331 
332 bool
333 RegExp::MatchResult::HasMatched() const
334 {
335 	return fData != NULL && fData->MatchCount() > 0;
336 }
337 
338 
339 size_t
340 RegExp::MatchResult::StartOffset() const
341 {
342 	return fData != NULL && fData->MatchCount() > 0
343 		? fData->Matches()[0].rm_so : 0;
344 }
345 
346 
347 size_t
348 RegExp::MatchResult::EndOffset() const
349 {
350 	return fData != NULL && fData->MatchCount() > 0
351 		? fData->Matches()[0].rm_eo : 0;
352 }
353 
354 
355 size_t
356 RegExp::MatchResult::GroupCount() const
357 {
358 	if (fData == NULL)
359 		return 0;
360 
361 	size_t matchCount = fData->MatchCount();
362 	return matchCount > 0 ? matchCount - 1 : 0;
363 }
364 
365 
366 size_t
367 RegExp::MatchResult::GroupStartOffsetAt(size_t index) const
368 {
369 	return fData != NULL && fData->MatchCount() > index + 1
370 		? fData->Matches()[index + 1].rm_so : 0;
371 }
372 
373 
374 size_t
375 RegExp::MatchResult::GroupEndOffsetAt(size_t index) const
376 {
377 	return fData != NULL && fData->MatchCount() > index + 1
378 		? fData->Matches()[index + 1].rm_eo : 0;
379 }
380 
381 
382 RegExp::MatchResult&
383 RegExp::MatchResult::operator=(const MatchResult& other)
384 {
385 	if (fData != NULL)
386 		fData->ReleaseReference();
387 
388 	fData = other.fData;
389 
390 	if (fData != NULL)
391 		fData->AcquireReference();
392 
393 	return *this;
394 }
395