1(foreign-declare "2 #include <assert.h>3 #include <stdlib.h>4 #include <stdbool.h>5 #include <regex.h>67 #include <sys/types.h>89 regmatch_t *10 make_submatches(size_t n)11 {12 regmatch_t *r;1314 if (!(r = malloc(sizeof(*r) * n)))15 return NULL;1617 return r;18 }1920 void21 submatches_free(regmatch_t *pmatch)22 {23 free(pmatch);24 }2526 regmatch_t *27 submatches_get(size_t nmatch, regmatch_t *pmatch, size_t idx)28 {29 if (idx >= nmatch)30 return NULL;3132 return &pmatch[idx];33 }3435 ssize_t36 submatch_start(regmatch_t *m)37 {38 return (ssize_t)m->rm_so;39 }4041 ssize_t42 submatch_end(regmatch_t *m)43 {44 return (ssize_t)m->rm_eo;45 }4647 regex_t *48 make_regex(int *err, char *pattern, bool igncase, bool ext, bool nl)49 {50 int r;51 int cflags;52 regex_t *re;5354 cflags = 0;55 if (igncase) cflags |= REG_ICASE;56 if (ext) cflags |= REG_EXTENDED;57 if (nl) cflags |= REG_NEWLINE;5859 if (!(re = malloc(sizeof(*re)))) {60 *err = REG_ESPACE;61 return NULL;62 }63 if ((r = regcomp(re, pattern, cflags))) {64 *err = r;65 return NULL;66 }6768 return re;69 }7071 void72 regex_free(regex_t *re)73 {74 regfree(re);75 free(re);76 }7778 size_t79 regex_subexprs(regex_t *re)80 {81 return re->re_nsub;82 }8384 char *85 regex_error(regex_t *re, int err)86 {87 int r;88 char *buf;89 int bufsiz;9091 /* Find out how big a buffer is needed and alloc it. */92 bufsiz = regerror(err, re, (char *)NULL, (size_t)0);93 assert(bufsiz > 0);94 if (!(buf = malloc(bufsiz)))95 return NULL;9697 r = regerror(err, re, buf, bufsiz);98 assert(r <= bufsiz);99 (void)r; /* NDEBUG */100101 return buf;102 }103104 int105 regex_exec(regex_t* re, char *string, size_t nmatch, regmatch_t *pmatch, bool notbol, bool noteol)106 {107 int r;108 int eflags;109110 eflags = 0;111 if (notbol) eflags |= REG_NOTBOL;112 if (noteol) eflags |= REG_NOTEOL;113114 return regexec(re, string, nmatch, pmatch, eflags);115 }116")117118;; Constants from regex.h119(define regex-ok 0)120(define regex-nomatch (foreign-value "REG_NOMATCH" int))121122(cond-expand123 (chicken-5124 ;; Type alias for R7RS bytevectors (not exported by the R7RS egg).125 ;; See: https://bugs.call-cc.org/ticket/1796126 (define-type bytevector u8vector))127 (chicken-6))128129;; Wrapper around the {{regex_t*}} raw C pointer, created to allow130;; utilizing CHICKEN type annotations for {{regex_t*}} values.131132(define-record-type Regex133 (%make-regex ptr)134 regex?135 (ptr regex-pointer))136137;; Convenience type alias138(define-type regex (struct Regex))139140;; Type annotations for Regex type constructor and accessors.141(: %make-regex (pointer -> regex))142(: regex-pointer (regex -> pointer))143144;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;145146;; Wrapper around {{regmatch_t*}} raw C pointer which additionally147;; tracks the amount of allocated submatches (which must not be equal148;; to the amount of matched submatches).149150(define-record-type Submatches151 (%make-submatches ptr count)152 submatches?153 (ptr submatches-pointer)154 (count submatches-count))155156;; Type annotations for Submatches type constructor and accessors.157(: %make-submatches (pointer integer -> (struct Submatches)))158(: submatches-pointer ((struct Submatches) -> pointer))159(: submatches-count ((struct Submatches) -> integer))160161;; Submatch is either a boolean (#f) for a non-matching optional162;; submatch or a pair of bytevector offsets.163(define-type submatch (or false (pair integer integer)))164165;; Convenience type alias for vector of submatches.166(define-type submatch-vector (vector-of submatch))167168;; Allocate memory to store the correct amount of submatches for169;; a given regular expression.170171(: make-submatches (regex -> (struct Submatches)))172(define (make-submatches regex)173 (define %%make-submatches174 (foreign-lambda c-pointer "make_submatches" size_t))175176 (let* ((n (+ (regex-subexprs regex) 1)) ;; reserve space for zero subexpression177 (p (%%make-submatches n)))178 (if p179 (begin180 (set-finalizer! p submatches-free)181 (%make-submatches p n))182 (error "out of memory"))))183184;; Free memory allocated for a raw {{regmatch_t*}} pointer. Invoked185;; automatically via a CHICKEN garbage collector finalizer.186187(: submatches-free (pointer -> undefined))188(define (submatches-free pointer)189 (define %submatches-free190 (foreign-lambda void "submatches_free" nonnull-c-pointer))191192 (%submatches-free pointer))193194;; Retrieve a single submatch by index. The zero index refers to the195;; substring that corresponds to the entire regular expression. As such,196;; actual submatches start at index 1.197198(: submatches-get ((struct Submatches) integer -> pointer))199(define (submatches-get subm idx)200 (define %submatches-get201 (foreign-lambda c-pointer "submatches_get" size_t nonnull-c-pointer size_t))202203 (let* ((ptr (submatches-pointer subm))204 (cnt (submatches-count subm))205 (ret (%submatches-get cnt ptr idx)))206 (if ret207 ret208 (error (string-append "out of bounds submatch: " (number->string idx))))))209210;; Retrieve the start byte offset of a given submatch.211212(: submatch-start (pointer -> integer))213(define (submatch-start match)214 (define %submatch-start215 (foreign-lambda ssize_t "submatch_start" nonnull-c-pointer))216217 (%submatch-start match))218219;; Retrieve the end byte offset of a given submatch.220221(: submatch-end (pointer -> integer))222(define (submatch-end match)223 (define %submatch-end224 (foreign-lambda ssize_t "submatch_end" nonnull-c-pointer))225226 (%submatch-end match))227228;; Convert single submatch to a pair or a boolean (in the case229;; of a non-matching optional submatch).230231(: ->submatch (pointer -> submatch))232(define (pointer->submatch pointer)233 (let ((start (submatch-start pointer))234 (end (submatch-end pointer)))235 (if (and (eqv? start -1) (eqv? end -1))236 #f237 (cons start end))))238239;; Convert encountered submatches to a vector.240241(: submatches->vector ((struct Submatches) -> submatch-vector))242(define (submatches->vector subm)243 (define (%submatches->vector idx vec)244 (if (>= idx (submatches-count subm))245 idx246 (let ((sptr (submatches-get subm idx)))247 (vector-set! vec idx (pointer->submatch sptr))248 (%submatches->vector (+ idx 1) vec))))249250 (let* ((vec (make-vector (submatches-count subm)))251 (matched (%submatches->vector 0 vec)))252 ;; Resize vector to actual amount of matched submatches.253 (vector-copy vec 0 matched)))254255;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;256257;;> Returns a pre-compiled regular expression object for the given258;;> {{pattern}}. The optional arguments {{ignorecase}} and {{extended}}259;;> specify whether the case should be ignored during matching and if ERE260;;> (instead of BRE) syntax should be used. The remaining {{multiline}}261;;> optional argument will cause the string to be treated as multiple262;;> lines (affects handling of {{^}} and {{$}}). If an error occurs263;;> during regex compilation, an exception is raised.264265(: make-regex (string #!optional boolean boolean boolean -> regex))266(define (make-regex pattern #!optional ignorecase extended multiline)267 (define %%make-regex268 (foreign-lambda c-pointer "make_regex" (nonnull-c-pointer int) nonnull-c-string bool bool bool))269270 (let-location ((err integer 0))271 (let ((re (%%make-regex (location err) pattern ignorecase extended multiline)))272 (if re273 (begin274 (set-finalizer! re regex-free)275 (%make-regex re))276 (regex-error re err)))))277278;; Returns amount of subexpressions in given regular expressions.279280(: regex-subexprs (regex -> integer))281(define (regex-subexprs regex)282 (define %regex-subexprs283 (foreign-lambda size_t "regex_subexprs" nonnull-c-pointer))284285 (%regex-subexprs (regex-pointer regex)))286287;; Extracts error condition from given {{regex_t*}} pointer value288;; and associated error code as returned by {{regcomp(3)}}. This289;; procedure always raises an error.290291(: regex-error (pointer integer -> noreturn))292(define (regex-error regex err-code)293 (define %regex-error294 (foreign-lambda c-string* "regex_error" c-pointer int))295296 ;; Due to the c-string* type specifier, CHICKEN will copy memory297 ;; allocated for the error message to a temporary storage and298 ;; free it automatically.299 (let ((err-msg (%regex-error regex err-code)))300 (if err-msg301 (error (string-append "regex error: " err-msg))302 (error "out of memory"))))303304;; Low-Level wrapper around {{regexec(3)}} used internally by both305;; {{regex-exec}} and {{regex-match?}} (see documentation below).306;; Returns {{#t}} if the regex matches, {{#f}} if it doesn't, and raises307;; an error if {{regexec(3)}} failed.308309(: %regex-exec (regex string integer (or false pointer) boolean boolean -> boolean))310(define (%regex-exec regex string submatches-count submatches-ptr notbol noteol)311 (define %%regex-exec312 (foreign-lambda int "regex_exec" nonnull-c-pointer nonnull-c-string313 size_t c-pointer bool bool))314315 (let* ((p (regex-pointer regex))316 (r (%%regex-exec p string submatches-count submatches-ptr notbol noteol)))317 (cond318 ((eqv? r regex-ok) #t)319 ((eqv? r regex-nomatch) #f)320 (else (regex-error p r)))))321322;;> Execute the given {{regex}} on the given {{bytevector}}. Returns323;;> {{#f}} if the match failed, or a vector of matching subexpressions.324;;> In the vector, each element is either {{#f}} (for non-participating325;;> optional submatches) or a pair of bytevector offsets. The first326;;> element in the pair specifies the beginning of the submatch in the327;;> bytevector, the second element specifies the end of the submatch.328;;> The first pair in the vector corresponds to the matched substring329;;> for the entire regular expression.330;;>331;;> The optional {{notbol}} and {{noteol}} procedure arguments control332;;> whether the first/last character of the input should be considered333;;> the start/end of the line.334335(: regex-exec (regex bytevector #!optional boolean boolean -> (or false submatch-vector)))336(define (regex-exec regex bytevector #!optional notbol noteol)337 (let* ((subm (make-submatches regex))338 (scnt (submatches-count subm))339 (sptr (submatches-pointer subm)))340 (if (%regex-exec regex (utf8->string bytevector) scnt sptr notbol noteol)341 (submatches->vector subm)342 #f)))343344;;> Check whether the given {{regex}} is matched by the given345;;> {{string}}. If so, {{#t}} is returned, otherwise {{#f}} is returned.346;;> This procedure is essentially a variant of {{regex-exec}} which347;;> supports strings instead of bytevectors directly and thus doesn't348;;> support submatches. Refer to {{regex-exec}} for documentation on349;;> the optional {{notbol}} and {{noteol}} procedure parameters.350351(: regex-match? (regex string #!optional boolean boolean -> boolean))352(define (regex-match? regex string #!optional notbol noteol)353 (%regex-exec regex string 0 #f notbol noteol))354355;; Frees all resources allocate for a {{regex_t*}} pointer value. Invoked356;; automatically via a CHICKEN garbage collector finalizer.357358(: regex-free (pointer -> undefined))359(define (regex-free ptr)360 (define %regex-free361 (foreign-lambda void "regex_free" nonnull-c-pointer))362363 (%regex-free ptr))