posix-regex

CHICKEN Scheme wrapper for POSIX regular expression matching

git clone https://git.8pit.net/posix-regex.git

  1(foreign-declare "
  2  #include <assert.h>
  3  #include <stdlib.h>
  4  #include <stdbool.h>
  5  #include <regex.h>
  6
  7  #include <sys/types.h>
  8
  9  regmatch_t *
 10  make_submatches(size_t n)
 11  {
 12    regmatch_t *r;
 13
 14    if (!(r = malloc(sizeof(*r) * n)))
 15      return NULL;
 16
 17    return r;
 18  }
 19
 20  void
 21  submatches_free(regmatch_t *pmatch)
 22  {
 23    free(pmatch);
 24  }
 25
 26  regmatch_t *
 27  submatches_get(size_t nmatch, regmatch_t *pmatch, size_t idx)
 28  {
 29    if (idx >= nmatch)
 30      return NULL;
 31
 32    return &pmatch[idx];
 33  }
 34
 35  ssize_t
 36  submatch_start(regmatch_t *m)
 37  {
 38    return (ssize_t)m->rm_so;
 39  }
 40
 41  ssize_t
 42  submatch_end(regmatch_t *m)
 43  {
 44    return (ssize_t)m->rm_eo;
 45  }
 46
 47  regex_t *
 48  make_regex(int *err, char *pattern, bool igncase, bool ext, bool nl)
 49  {
 50    int r;
 51    int cflags;
 52    regex_t *re;
 53
 54    cflags = 0;
 55    if (igncase) cflags |= REG_ICASE;
 56    if (ext)     cflags |= REG_EXTENDED;
 57    if (nl)      cflags |= REG_NEWLINE;
 58
 59    if (!(re = malloc(sizeof(*re)))) {
 60      *err = REG_ESPACE;
 61      return NULL;
 62    }
 63    if ((r = regcomp(re, pattern, cflags))) {
 64      *err = r;
 65      return NULL;
 66    }
 67
 68    return re;
 69  }
 70
 71  void
 72  regex_free(regex_t *re)
 73  {
 74    regfree(re);
 75    free(re);
 76  }
 77
 78  size_t
 79  regex_subexprs(regex_t *re)
 80  {
 81    return re->re_nsub;
 82  }
 83
 84  char *
 85  regex_error(regex_t *re, int err)
 86  {
 87    int r;
 88    char *buf;
 89    int bufsiz;
 90
 91    /* Find out how big a buffer is needed and alloc it. */
 92    bufsiz = regerror(err, re, (char *)NULL, (size_t)0);
 93    assert(bufsiz > 0);
 94    if (!(buf = malloc(bufsiz)))
 95      return NULL;
 96
 97    r = regerror(err, re, buf, bufsiz);
 98    assert(r <= bufsiz);
 99    (void)r; /* NDEBUG */
100
101    return buf;
102  }
103
104  int
105  regex_exec(regex_t* re, char *string, size_t nmatch, regmatch_t *pmatch, bool notbol, bool noteol)
106  {
107    int r;
108    int eflags;
109
110    eflags = 0;
111    if (notbol) eflags |= REG_NOTBOL;
112    if (noteol) eflags |= REG_NOTEOL;
113
114    return regexec(re, string, nmatch, pmatch, eflags);
115  }
116")
117
118;; Constants from regex.h
119(define regex-ok 0)
120(define regex-nomatch (foreign-value "REG_NOMATCH" int))
121
122;; Type alias for R7RS bytevectors (not exported by the R7RS egg).
123;; See: https://bugs.call-cc.org/ticket/1796
124(define-type bytevector u8vector)
125
126;; Wrapper around the {{regex_t*}} raw C pointer, created to allow
127;; utilizing CHICKEN type annotations for {{regex_t*}} values.
128
129(define-record-type Regex
130  (%make-regex ptr)
131  regex?
132  (ptr regex-pointer))
133
134;; Convenience type alias
135(define-type regex (struct Regex))
136
137;; Type annotations for Regex type constructor and accessors.
138(: %make-regex (pointer -> regex))
139(: regex-pointer (regex -> pointer))
140
141;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
142
143;; Wrapper around {{regmatch_t*}} raw C pointer which additionally
144;; tracks the amount of allocated submatches (which must not be equal
145;; to the amount of matched submatches).
146
147(define-record-type Submatches
148  (%make-submatches ptr count)
149  submatches?
150  (ptr submatches-pointer)
151  (count submatches-count))
152
153;; Type annotations for Submatches type constructor and accessors.
154(: %make-submatches (pointer integer -> (struct Submatches)))
155(: submatches-pointer ((struct Submatches) -> pointer))
156(: submatches-count ((struct Submatches) -> integer))
157
158;; Submatch is either a boolean (#f) for a non-matching optional
159;; submatch or a pair of bytevector offsets.
160(define-type submatch (or false (pair integer integer)))
161
162;; Convenience type alias for vector of submatches.
163(define-type submatch-vector (vector-of submatch))
164
165;; Allocate memory to store the correct amount of submatches for
166;; a given regular expression.
167
168(: make-submatches (regex -> (struct Submatches)))
169(define (make-submatches regex)
170  (define %%make-submatches
171    (foreign-lambda c-pointer "make_submatches" size_t))
172
173  (let* ((n (+ (regex-subexprs regex) 1)) ;; reserve space for zero subexpression
174         (p (%%make-submatches n)))
175    (if p
176        (begin
177          (set-finalizer! p submatches-free)
178          (%make-submatches p n))
179      (error "out of memory"))))
180
181;; Free memory allocated for a raw {{regmatch_t*}} pointer. Invoked
182;; automatically via a CHICKEN garbage collector finalizer.
183
184(: submatches-free (pointer -> undefined))
185(define (submatches-free pointer)
186  (define %submatches-free
187    (foreign-lambda void "submatches_free" nonnull-c-pointer))
188
189  (%submatches-free pointer))
190
191;; Retrieve a single submatch by index. The zero index refers to the
192;; substring that corresponds to the entire regular expression. As such,
193;; actual submatches start at index 1.
194
195(: submatches-get ((struct Submatches) integer -> pointer))
196(define (submatches-get subm idx)
197  (define %submatches-get
198    (foreign-lambda c-pointer "submatches_get" size_t nonnull-c-pointer size_t))
199
200  (let* ((ptr (submatches-pointer subm))
201         (cnt (submatches-count subm))
202         (ret (%submatches-get cnt ptr idx)))
203    (if ret
204      ret
205      (error (string-append "out of bounds submatch: " (number->string idx))))))
206
207;; Retrieve the start byte offset of a given submatch.
208
209(: submatch-start (pointer -> integer))
210(define (submatch-start match)
211  (define %submatch-start
212    (foreign-lambda ssize_t "submatch_start" nonnull-c-pointer))
213
214  (%submatch-start match))
215
216;; Retrieve the end byte offset of a given submatch.
217
218(: submatch-end (pointer -> integer))
219(define (submatch-end match)
220  (define %submatch-end
221    (foreign-lambda ssize_t "submatch_end" nonnull-c-pointer))
222
223  (%submatch-end match))
224
225;; Convert single submatch to a pair or a boolean (in the case
226;; of a non-matching optional submatch).
227
228(: ->submatch (pointer -> submatch))
229(define (pointer->submatch pointer)
230  (let ((start (submatch-start pointer))
231        (end   (submatch-end pointer)))
232    (if (and (eqv? start -1) (eqv? end -1))
233      #f
234      (cons start end))))
235
236;; Convert encountered submatches to a vector.
237
238(: submatches->vector ((struct Submatches) -> submatch-vector))
239(define (submatches->vector subm)
240  (define (%submatches->vector idx vec)
241    (if (>= idx (submatches-count subm))
242      idx
243      (let ((sptr (submatches-get subm idx)))
244        (vector-set! vec idx (pointer->submatch sptr))
245        (%submatches->vector (+ idx 1) vec))))
246
247  (let* ((vec (make-vector (submatches-count subm)))
248         (matched (%submatches->vector 0 vec)))
249    ;; Resize vector to actual amount of matched submatches.
250    (vector-copy vec 0 matched)))
251
252;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
253
254;;> Returns a pre-compiled regular expression object for the given
255;;> {{pattern}}. The optional arguments {{ignorecase}} and {{extended}}
256;;> specify whether the case should be ignored during matching and if ERE
257;;> (instead of BRE) syntax should be used. The remaining {{multiline}}
258;;> optional argument will cause the string to be treated as multiple
259;;> lines (affects handling of {{^}} and {{$}}). If an error occurs
260;;> during regex compilation, an exception is raised.
261
262(: make-regex (string #!optional boolean boolean boolean -> regex))
263(define (make-regex pattern #!optional ignorecase extended multiline)
264  (define %%make-regex
265    (foreign-lambda c-pointer "make_regex" (nonnull-c-pointer int) nonnull-c-string bool bool bool))
266
267  (let-location ((err integer 0))
268    (let ((re (%%make-regex (location err) pattern ignorecase extended multiline)))
269      (if re
270        (begin
271          (set-finalizer! re regex-free)
272          (%make-regex re))
273        (regex-error re err)))))
274
275;; Returns amount of subexpressions in given regular expressions.
276
277(: regex-subexprs (regex -> integer))
278(define (regex-subexprs regex)
279  (define %regex-subexprs
280    (foreign-lambda size_t "regex_subexprs" nonnull-c-pointer))
281
282  (%regex-subexprs (regex-pointer regex)))
283
284;; Extracts error condition from given {{regex_t*}} pointer value
285;; and associated error code as returned by {{regcomp(3)}}. This
286;; procedure always raises an error.
287
288(: regex-error (pointer integer -> noreturn))
289(define (regex-error regex err-code)
290  (define %regex-error
291    (foreign-lambda c-string* "regex_error" c-pointer int))
292
293  ;; Due to the c-string* type specifier, CHICKEN will copy memory
294  ;; allocated for the error message to a temporary storage and
295  ;; free it automatically.
296  (let ((err-msg (%regex-error regex err-code)))
297    (if err-msg
298      (error (string-append "regex error: " err-msg))
299      (error "out of memory"))))
300
301;; Low-Level wrapper around {{regexec(3)}} used internally by both
302;; {{regex-exec}} and {{regex-match?}} (see documentation below).
303;; Returns {{#t}} if the regex matches, {{#f}} if it doesn't, and raises
304;; an error if {{regexec(3)}} failed.
305
306(: %regex-exec (regex string integer (or false pointer) boolean boolean -> boolean))
307(define (%regex-exec regex string submatches-count submatches-ptr notbol noteol)
308  (define %%regex-exec
309    (foreign-lambda int "regex_exec" nonnull-c-pointer nonnull-c-string
310                                     size_t c-pointer bool bool))
311
312  (let* ((p (regex-pointer regex))
313         (r (%%regex-exec p string submatches-count submatches-ptr notbol noteol)))
314    (cond
315      ((eqv? r regex-ok) #t)
316      ((eqv? r regex-nomatch) #f)
317      (else (regex-error p r)))))
318
319;;> Execute the given {{regex}} on the given {{bytevector}}. Returns
320;;> {{#f}} if the match failed, or a vector of matching subexpressions.
321;;> In the vector, each element is either {{#f}} (for non-participating
322;;> optional submatches) or a pair of bytevector offsets. The first
323;;> element in the pair specifies the beginning of the submatch in the
324;;> bytevector, the second element specifies the end of the submatch.
325;;> The first pair in the vector corresponds to the matched substring
326;;> for the entire regular expression.
327;;>
328;;> The optional {{notbol}} and {{noteol}} procedure arguments control
329;;> whether the first/last character of the input should be considered
330;;> the start/end of the line.
331
332(: regex-exec (regex bytevector #!optional boolean boolean -> (or false submatch-vector)))
333(define (regex-exec regex bytevector #!optional notbol noteol)
334  (let* ((subm (make-submatches regex))
335         (scnt (submatches-count subm))
336         (sptr (submatches-pointer subm)))
337    (if (%regex-exec regex (utf8->string bytevector) scnt sptr notbol noteol)
338      (submatches->vector subm)
339      #f)))
340
341;;> Check whether the given {{regex}} is matched by the given
342;;> {{string}}. If so, {{#t}} is returned, otherwise {{#f}} is returned.
343;;> This procedure is essentially a variant of {{regex-exec}} which
344;;> supports strings instead of bytevectors directly and thus doesn't
345;;> support submatches. Refer to {{regex-exec}} for documentation on
346;;> the optional {{notbol}} and {{noteol}} procedure parameters.
347
348(: regex-match? (regex string #!optional boolean boolean -> boolean))
349(define (regex-match? regex string #!optional notbol noteol)
350  (%regex-exec regex string 0 #f notbol noteol))
351
352;; Frees all resources allocate for a {{regex_t*}} pointer value. Invoked
353;; automatically via a CHICKEN garbage collector finalizer.
354
355(: regex-free (pointer -> undefined))
356(define (regex-free ptr)
357  (define %regex-free
358    (foreign-lambda void "regex_free" nonnull-c-pointer))
359
360  (%regex-free ptr))