posix-regex

CHICKEN Scheme wrapper for POSIX regular expression matching

git clone https://git.8pit.net/posix-regex.git

  1(foreign-declare "
  2  #include <assert.h>
  3  #include <stdlib.h>
  4  #include <stdbool.h>
  5  #include <regex.h>
  6
  7  #include <sys/types.h>
  8
  9  regmatch_t *
 10  make_submatches(size_t n)
 11  {
 12    regmatch_t *r;
 13
 14    if (!(r = malloc(sizeof(*r) * n)))
 15      return NULL;
 16
 17    return r;
 18  }
 19
 20  void
 21  submatches_free(regmatch_t *pmatch)
 22  {
 23    free(pmatch);
 24  }
 25
 26  regmatch_t *
 27  submatches_get(size_t nmatch, regmatch_t *pmatch, size_t idx)
 28  {
 29    if (idx >= nmatch)
 30      return NULL;
 31
 32    return &pmatch[idx];
 33  }
 34
 35  ssize_t
 36  submatch_start(regmatch_t *m)
 37  {
 38    return (ssize_t)m->rm_so;
 39  }
 40
 41  ssize_t
 42  submatch_end(regmatch_t *m)
 43  {
 44    return (ssize_t)m->rm_eo;
 45  }
 46
 47  regex_t *
 48  make_regex(int *err, char *pattern, bool igncase, bool ext, bool nl)
 49  {
 50    int r;
 51    int cflags;
 52    regex_t *re;
 53
 54    cflags = 0;
 55    if (igncase) cflags |= REG_ICASE;
 56    if (ext)     cflags |= REG_EXTENDED;
 57    if (nl)      cflags |= REG_NEWLINE;
 58
 59    if (!(re = malloc(sizeof(*re)))) {
 60      *err = REG_ESPACE;
 61      return NULL;
 62    }
 63    if ((r = regcomp(re, pattern, cflags))) {
 64      *err = r;
 65      return NULL;
 66    }
 67
 68    return re;
 69  }
 70
 71  void
 72  regex_free(regex_t *re)
 73  {
 74    regfree(re);
 75    free(re);
 76  }
 77
 78  size_t
 79  regex_subexprs(regex_t *re)
 80  {
 81    return re->re_nsub;
 82  }
 83
 84  char *
 85  regex_error(regex_t *re, int err)
 86  {
 87    int r;
 88    char *buf;
 89    int bufsiz;
 90
 91    /* Find out how big a buffer is needed and alloc it. */
 92    bufsiz = regerror(err, re, (char *)NULL, (size_t)0);
 93    assert(bufsiz > 0);
 94    if (!(buf = malloc(bufsiz)))
 95      return NULL;
 96
 97    r = regerror(err, re, buf, bufsiz);
 98    assert(r <= bufsiz);
 99    (void)r; /* NDEBUG */
100
101    return buf;
102  }
103
104  int
105  regex_exec(regex_t* re, char *string, size_t nmatch, regmatch_t *pmatch, bool notbol, bool noteol)
106  {
107    int r;
108    int eflags;
109
110    eflags = 0;
111    if (notbol) eflags |= REG_NOTBOL;
112    if (noteol) eflags |= REG_NOTEOL;
113
114    return regexec(re, string, nmatch, pmatch, eflags);
115  }
116")
117
118;; Constants from regex.h
119(define regex-ok 0)
120(define regex-nomatch (foreign-value "REG_NOMATCH" int))
121
122(cond-expand
123  (chicken-5
124    ;; Type alias for R7RS bytevectors (not exported by the R7RS egg).
125    ;; See: https://bugs.call-cc.org/ticket/1796
126    (define-type bytevector u8vector))
127  (chicken-6))
128
129;; Wrapper around the {{regex_t*}} raw C pointer, created to allow
130;; utilizing CHICKEN type annotations for {{regex_t*}} values.
131
132(define-record-type Regex
133  (%make-regex ptr)
134  regex?
135  (ptr regex-pointer))
136
137;; Convenience type alias
138(define-type regex (struct Regex))
139
140;; Type annotations for Regex type constructor and accessors.
141(: %make-regex (pointer -> regex))
142(: regex-pointer (regex -> pointer))
143
144;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
145
146;; Wrapper around {{regmatch_t*}} raw C pointer which additionally
147;; tracks the amount of allocated submatches (which must not be equal
148;; to the amount of matched submatches).
149
150(define-record-type Submatches
151  (%make-submatches ptr count)
152  submatches?
153  (ptr submatches-pointer)
154  (count submatches-count))
155
156;; Type annotations for Submatches type constructor and accessors.
157(: %make-submatches (pointer integer -> (struct Submatches)))
158(: submatches-pointer ((struct Submatches) -> pointer))
159(: submatches-count ((struct Submatches) -> integer))
160
161;; Submatch is either a boolean (#f) for a non-matching optional
162;; submatch or a pair of bytevector offsets.
163(define-type submatch (or false (pair integer integer)))
164
165;; Convenience type alias for vector of submatches.
166(define-type submatch-vector (vector-of submatch))
167
168;; Allocate memory to store the correct amount of submatches for
169;; a given regular expression.
170
171(: make-submatches (regex -> (struct Submatches)))
172(define (make-submatches regex)
173  (define %%make-submatches
174    (foreign-lambda c-pointer "make_submatches" size_t))
175
176  (let* ((n (+ (regex-subexprs regex) 1)) ;; reserve space for zero subexpression
177         (p (%%make-submatches n)))
178    (if p
179        (begin
180          (set-finalizer! p submatches-free)
181          (%make-submatches p n))
182      (error "out of memory"))))
183
184;; Free memory allocated for a raw {{regmatch_t*}} pointer. Invoked
185;; automatically via a CHICKEN garbage collector finalizer.
186
187(: submatches-free (pointer -> undefined))
188(define (submatches-free pointer)
189  (define %submatches-free
190    (foreign-lambda void "submatches_free" nonnull-c-pointer))
191
192  (%submatches-free pointer))
193
194;; Retrieve a single submatch by index. The zero index refers to the
195;; substring that corresponds to the entire regular expression. As such,
196;; actual submatches start at index 1.
197
198(: submatches-get ((struct Submatches) integer -> pointer))
199(define (submatches-get subm idx)
200  (define %submatches-get
201    (foreign-lambda c-pointer "submatches_get" size_t nonnull-c-pointer size_t))
202
203  (let* ((ptr (submatches-pointer subm))
204         (cnt (submatches-count subm))
205         (ret (%submatches-get cnt ptr idx)))
206    (if ret
207      ret
208      (error (string-append "out of bounds submatch: " (number->string idx))))))
209
210;; Retrieve the start byte offset of a given submatch.
211
212(: submatch-start (pointer -> integer))
213(define (submatch-start match)
214  (define %submatch-start
215    (foreign-lambda ssize_t "submatch_start" nonnull-c-pointer))
216
217  (%submatch-start match))
218
219;; Retrieve the end byte offset of a given submatch.
220
221(: submatch-end (pointer -> integer))
222(define (submatch-end match)
223  (define %submatch-end
224    (foreign-lambda ssize_t "submatch_end" nonnull-c-pointer))
225
226  (%submatch-end match))
227
228;; Convert single submatch to a pair or a boolean (in the case
229;; of a non-matching optional submatch).
230
231(: ->submatch (pointer -> submatch))
232(define (pointer->submatch pointer)
233  (let ((start (submatch-start pointer))
234        (end   (submatch-end pointer)))
235    (if (and (eqv? start -1) (eqv? end -1))
236      #f
237      (cons start end))))
238
239;; Convert encountered submatches to a vector.
240
241(: submatches->vector ((struct Submatches) -> submatch-vector))
242(define (submatches->vector subm)
243  (define (%submatches->vector idx vec)
244    (if (>= idx (submatches-count subm))
245      idx
246      (let ((sptr (submatches-get subm idx)))
247        (vector-set! vec idx (pointer->submatch sptr))
248        (%submatches->vector (+ idx 1) vec))))
249
250  (let* ((vec (make-vector (submatches-count subm)))
251         (matched (%submatches->vector 0 vec)))
252    ;; Resize vector to actual amount of matched submatches.
253    (vector-copy vec 0 matched)))
254
255;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
256
257;;> Returns a pre-compiled regular expression object for the given
258;;> {{pattern}}. The optional arguments {{ignorecase}} and {{extended}}
259;;> specify whether the case should be ignored during matching and if ERE
260;;> (instead of BRE) syntax should be used. The remaining {{multiline}}
261;;> optional argument will cause the string to be treated as multiple
262;;> lines (affects handling of {{^}} and {{$}}). If an error occurs
263;;> during regex compilation, an exception is raised.
264
265(: make-regex (string #!optional boolean boolean boolean -> regex))
266(define (make-regex pattern #!optional ignorecase extended multiline)
267  (define %%make-regex
268    (foreign-lambda c-pointer "make_regex" (nonnull-c-pointer int) nonnull-c-string bool bool bool))
269
270  (let-location ((err integer 0))
271    (let ((re (%%make-regex (location err) pattern ignorecase extended multiline)))
272      (if re
273        (begin
274          (set-finalizer! re regex-free)
275          (%make-regex re))
276        (regex-error re err)))))
277
278;; Returns amount of subexpressions in given regular expressions.
279
280(: regex-subexprs (regex -> integer))
281(define (regex-subexprs regex)
282  (define %regex-subexprs
283    (foreign-lambda size_t "regex_subexprs" nonnull-c-pointer))
284
285  (%regex-subexprs (regex-pointer regex)))
286
287;; Extracts error condition from given {{regex_t*}} pointer value
288;; and associated error code as returned by {{regcomp(3)}}. This
289;; procedure always raises an error.
290
291(: regex-error (pointer integer -> noreturn))
292(define (regex-error regex err-code)
293  (define %regex-error
294    (foreign-lambda c-string* "regex_error" c-pointer int))
295
296  ;; Due to the c-string* type specifier, CHICKEN will copy memory
297  ;; allocated for the error message to a temporary storage and
298  ;; free it automatically.
299  (let ((err-msg (%regex-error regex err-code)))
300    (if err-msg
301      (error (string-append "regex error: " err-msg))
302      (error "out of memory"))))
303
304;; Low-Level wrapper around {{regexec(3)}} used internally by both
305;; {{regex-exec}} and {{regex-match?}} (see documentation below).
306;; Returns {{#t}} if the regex matches, {{#f}} if it doesn't, and raises
307;; an error if {{regexec(3)}} failed.
308
309(: %regex-exec (regex string integer (or false pointer) boolean boolean -> boolean))
310(define (%regex-exec regex string submatches-count submatches-ptr notbol noteol)
311  (define %%regex-exec
312    (foreign-lambda int "regex_exec" nonnull-c-pointer nonnull-c-string
313                                     size_t c-pointer bool bool))
314
315  (let* ((p (regex-pointer regex))
316         (r (%%regex-exec p string submatches-count submatches-ptr notbol noteol)))
317    (cond
318      ((eqv? r regex-ok) #t)
319      ((eqv? r regex-nomatch) #f)
320      (else (regex-error p r)))))
321
322;;> Execute the given {{regex}} on the given {{bytevector}}. Returns
323;;> {{#f}} if the match failed, or a vector of matching subexpressions.
324;;> In the vector, each element is either {{#f}} (for non-participating
325;;> optional submatches) or a pair of bytevector offsets. The first
326;;> element in the pair specifies the beginning of the submatch in the
327;;> bytevector, the second element specifies the end of the submatch.
328;;> The first pair in the vector corresponds to the matched substring
329;;> for the entire regular expression.
330;;>
331;;> The optional {{notbol}} and {{noteol}} procedure arguments control
332;;> whether the first/last character of the input should be considered
333;;> the start/end of the line.
334
335(: regex-exec (regex bytevector #!optional boolean boolean -> (or false submatch-vector)))
336(define (regex-exec regex bytevector #!optional notbol noteol)
337  (let* ((subm (make-submatches regex))
338         (scnt (submatches-count subm))
339         (sptr (submatches-pointer subm)))
340    (if (%regex-exec regex (utf8->string bytevector) scnt sptr notbol noteol)
341      (submatches->vector subm)
342      #f)))
343
344;;> Check whether the given {{regex}} is matched by the given
345;;> {{string}}. If so, {{#t}} is returned, otherwise {{#f}} is returned.
346;;> This procedure is essentially a variant of {{regex-exec}} which
347;;> supports strings instead of bytevectors directly and thus doesn't
348;;> support submatches. Refer to {{regex-exec}} for documentation on
349;;> the optional {{notbol}} and {{noteol}} procedure parameters.
350
351(: regex-match? (regex string #!optional boolean boolean -> boolean))
352(define (regex-match? regex string #!optional notbol noteol)
353  (%regex-exec regex string 0 #f notbol noteol))
354
355;; Frees all resources allocate for a {{regex_t*}} pointer value. Invoked
356;; automatically via a CHICKEN garbage collector finalizer.
357
358(: regex-free (pointer -> undefined))
359(define (regex-free ptr)
360  (define %regex-free
361    (foreign-lambda void "regex_free" nonnull-c-pointer))
362
363  (%regex-free ptr))