edward - replace.scm

  1;;>| Replacement Parser
  2;;>
  3;;> Edward [parser combinators][edward parse] for parsing replacement strings.
  4;;>
  5;;> [edward parse]: edward.parse.html
  6
  7(define parse-backref
  8  (parse-map
  9    (parse-seq
 10      (parse-char #\\)
 11      parse-digits)
 12    (lambda (lst)
 13      (cons 'backref (cadr lst)))))
 14
 15(define parse-matched
 16  (parse-map
 17    (parse-char #\&)
 18    (lambda (ch)
 19      (cons 'matched ch))))
 20
 21(define (parse-restr delim)
 22  (define replace-ctrl
 23    (char-set-adjoin (char-set #\\ #\& #\newline) delim))
 24
 25  (parse-map
 26    (parse-as-string
 27      (parse-repeat+
 28        (parse-or
 29          ;; special handling for '%' as it does not neccessarily
 30          ;; need to be escaped unless it's the only character.
 31          (parse-esc
 32            (parse-char (char-set-adjoin replace-ctrl #\%)))
 33          (parse-not-char replace-ctrl))))
 34    (lambda (str)
 35      (cons 'restr str))))
 36
 37;;> Parse a replacement string within text enclosed with the delimiter
 38;;> `delim`. While the combinator does not parse the enclosed character,
 39;;> it ensures that this `delim` character is escaped (using a `\`)
 40;;> within the replacement.
 41;;>
 42;;> Refer to the documentation of the [ed substitute][ed substitute]
 43;;> command for more information on special character support within
 44;;> the replacement. All of these special characters can also be
 45;;> escaped.
 46;;>
 47;;> [ed substitute]: https://pubs.opengroup.org/onlinepubs/9699919799/utilities/ed.html#tag_20_38_13_25
 48
 49(define (parse-replace delim)
 50  (parse-map
 51    (parse-repeat
 52      (parse-atomic
 53        (parse-or
 54          parse-backref
 55          parse-matched
 56          (parse-restr delim))))
 57    (lambda (lst)
 58      ;; If the replacement is empty replace matched text with an empty string.
 59      (if (null? lst)
 60        (cons '(restr . "") lst)
 61        lst))))
 62
 63;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 64
 65;;>| Replacement Procedures
 66;;>
 67;;> Procedures for performing replacements using a parsed replacement string.
 68
 69(define (submatch subm bv n)
 70  (if (>= n (vector-length subm))
 71    (string->utf8 (number->string n)) ;; XXX: Handling for invalid backreferences
 72    (let ((match (vector-ref subm n)))
 73      (if match
 74        (bytevector-copy bv (car match) (cdr match))
 75        #u8()))))
 76
 77(define (regex-replace* regex subst bv nth)
 78  (define (apply-replacement subm bv replacement)
 79    (fold (lambda (x y)
 80            (bytevector-append y
 81              (match x
 82                (('restr . s)   (string->utf8 s))
 83                (('matched . _) (submatch subm bv 0))
 84                (('backref . n) (submatch subm bv n)))))
 85          #u8() replacement))
 86
 87  ;; TODO: Refactor this function and make it more readable.
 88  ;; Also don't rely on (values …) truncation (not in R7RS).
 89  (define (%regex-replace* re start n)
 90    (let* ((v (bytevector-copy bv start))
 91           (subm (regex-exec regex v)))
 92      (if subm
 93        (let* ((m (vector-ref subm 0)) ;; submatch for entire regex
 94               (s (car m))             ;; start of submatch
 95               (e (cdr m))             ;; end of submatch
 96
 97               (i (+ start e))         ;; next index in bv
 98               (r (delay (bytevector-append
 99                           (bytevector-copy v 0 s)
100                           (apply-replacement subm v re)))))
101          (values
102            (if (eqv? n nth)
103              (bytevector-append (force r) (bytevector-copy bv i))
104              (bytevector-append
105                (if (zero? nth) (force r) (bytevector-copy v 0 e))
106                (%regex-replace* re i (+ n 1))))
107            #t))
108        (values v #f))))
109
110  (%regex-replace* subst 0 1))
111
112;;> Replace `nth` occurrence of `regex` in `str` with `subst`. If `nth`
113;;> is zero, all occurrences are replaced. Returns two results: The string
114;;> after performing all replacement and a boolean indicating if any
115;;> replacements were successfully performed. The `regex` must be
116;;> created using [make-regex][make-regex], while the replacement string
117;;> `subst` must be parsed using [parse-replace][parse-replace].
118;;>
119;;> [make-regex]: https://wiki.call-cc.org/eggref/5/posix-regex#make-regex
120;;> [parse-replace]: #parse-replace
121
122(define (regex-replace regex subst str nth)
123  ;; regexec(3p) offsets are byte, not character offsets.
124  ;; Thus, the string needs to be converted to a bytevector.
125  (let-values (((result modified) (regex-replace* regex subst (string->utf8 str) nth)))
126    (values (utf8->string result) modified)))