scmdoc - R7RS.hs

  1{-# LANGUAGE OverloadedStrings #-}
  2
  3-- | This module implements parser combinators for the R7RS Scheme
  4-- language standard. The module implements a parser for the formal
  5-- syntax defined in Section 7.1 of the aformentioned standard.
  6module SchemeDoc.Parser.R7RS (scheme, parseFromFile)
  7where
  8
  9import SchemeDoc.Error
 10import SchemeDoc.Parser.Number
 11import SchemeDoc.Parser.Util
 12import SchemeDoc.Types
 13
 14import Control.Exception (throwIO)
 15import Data.Char
 16import qualified Data.Text as T
 17
 18import Text.Parsec.Char (endOfLine)
 19import Text.ParserCombinators.Parsec hiding (parseFromFile, space, spaces, string)
 20import qualified Text.ParserCombinators.Parsec as P
 21
 22-- Intraline whitespace.
 23--
 24--  <intraline whitespace> → <space or tab>
 25--
 26intraSpace :: Parser Char
 27intraSpace = char ' ' <|> char '\t'
 28
 29-- Zero or more intraline whitespaces.
 30intraSpaces :: Parser ()
 31intraSpaces = skipMany intraSpace
 32
 33-- Whitespace.
 34--
 35--  <whitespace> → <intraline whitespace> | <line ending>
 36--
 37-- where
 38--
 39--  <line ending> → <newline> | <return> <newline> | <return>
 40--
 41space :: Parser Char
 42space = intraSpace <|> endOfLine <|> char '\r'
 43
 44-- Zero or more whitespaces.
 45spaces :: Parser ()
 46spaces = skipMany space
 47
 48------------------------------------------------------------------------
 49
 50-- Inline hex escape sequence.
 51--
 52--  <inline hex escape> → \x<hex scalar value>;
 53--
 54inlineHexEsc :: Parser Char
 55inlineHexEsc = between (P.string "\\x") (char ';') (fmap chr hex)
 56
 57-- Explicit sign notation for numeric types.
 58--
 59--  <explicit sign> → + | -
 60--
 61explicitSign :: Parser Char
 62explicitSign = char '+' <|> char '-'
 63
 64-- Initial character for an identifier.
 65--
 66--  <initial> → <letter> | <special initial>
 67--
 68initial :: Parser Char
 69initial = letter <|> specialInitial
 70
 71-- Special initial characters for symbols / identifiers.
 72--
 73--  <special initial> → ! | $ | % | & | * | / | : | < | = | > | ? | ^ | _ | ~
 74--
 75-- TODO: R⁷RS allows for a Scheme implementation to support
 76-- additional "extended identifier characters" the set below
 77-- is just the minimal set of extended identifier characters.
 78specialInitial :: Parser Char
 79specialInitial = oneOf "!$%&*+-./:<=>?@^_~"
 80
 81-- A special subsequent symbol / identifier character.
 82--
 83--  <special subsequent> → <explicit sign> | . | @
 84--
 85specialSubsequent :: Parser Char
 86specialSubsequent = explicitSign <|> char '.' <|> char '@'
 87
 88-- A subsequent character for an identifier.
 89--
 90--  <subsequent> → <initial> | <digit> | <special subsequent>
 91--
 92subsequent :: Parser Char
 93subsequent = initial <|> digit <|> specialSubsequent
 94
 95-- Element of a Scheme symbol / identifier.
 96--
 97--  <symbol element> →
 98--      <any character other than <vertical line> or \>
 99--      | <inline hex escape> | <mnemonic escape> | \|
100--
101symbolElement :: Parser Char
102symbolElement =
103    noneOf "\\|"
104        <|> try inlineHexEsc
105        <|> mnemonicEsc
106
107-- Parse a Scheme mnemonic escape character.
108--
109--  <mnemonic escape> → \a | \b | \t | \n | \r
110--
111mnemonicEsc :: Parser Char
112mnemonicEsc =
113    char '\\'
114        >> ( bind "a" '\a' -- alarm
115                <|> bind "b" '\b' -- backspace
116                <|> bind "t" '\t' -- character tabulation
117                <|> bind "n" '\n' -- linefeed
118                <|> bind "r" '\r' -- return
119           )
120
121-- Parse a character in a string, including escape sequences.
122-- Returns Nothing for escaped newlines.
123--
124--  <string element> → <any character other than " or \>
125--      | <mnemonic escape> | \" | \\
126--      | \<intraline whitespace>*<line ending>
127--        <intraline whitespace>*
128--      | <inline hex escape>
129--
130stringElement :: Parser (Maybe Char)
131stringElement =
132    Just <$> noneOf "\"\\"
133        <|> Just <$> try mnemonicEsc
134        <|> try (bind "\\\"" (Just '"'))
135        <|> try (bind "\\\\" (Just '\\'))
136        <|> (Nothing <$ try (char '\\' >> intraSpaces >> char '\n' >> intraSpaces))
137        <|> Just <$> inlineHexEsc
138
139-- TODO: Add something along the lines of `choice-try` to clean this up.
140-- See the existing `choice` and `try` combinators provided by Parsec.
141
142-- Scheme character name.
143--
144--  <character name> → alarm | backspace | delete
145--      | escape | newline | null | return | space | tab
146--
147characterName :: Parser Char
148characterName =
149    bind "alarm" '\a'
150        <|> bind "backspace" '\b'
151        <|> bind "delete" '\DEL'
152        <|> bind "escape" '\ESC'
153        <|> bind "newline" '\n'
154        <|> bind "null" '\0'
155        <|> bind "return" '\r'
156        <|> bind "space" ' '
157        <|> bind "tab" '\t'
158
159-- Parse a Scheme delimiter.
160--
161--  <delimiter> → <whitespace> | <vertical line> | ( | ) | " | ;
162--
163-- Does not consume any input on failure.
164delimiter :: Parser ()
165delimiter = skip $ space <|> char '|' <|> char '(' <|> char ')' <|> char '"' <|> char ';'
166
167-- Scheme block comment.
168--
169--  <nested comment> → #| <comment text>
170--                      <comment cont>* |#
171--
172-- XXX: Not sure if I understood the nested comment grammar rule correctly.
173nestedComment :: Parser String
174nestedComment = P.string "#|" >> manyTill anyChar (P.string "|#")
175
176-- Source code comment.
177--
178--  <comment> → ; <all subsequent characters up to a line ending>
179--      | <nested comment>
180--      | #; <intertoken space> <datum>
181--
182-- This parser intentionally ignores documentation comments.
183-- See the docComment parser below.
184comment :: Parser ()
185comment =
186    skip $
187        (char ';' >> notFollowedBy (P.string ";>") >> manyTill anyChar endOfLine)
188            <|> nestedComment
189
190-- TODO: #; comments
191
192------------------------------------------------------------------------
193
194-- A source code comment for documentation purposes.
195-- This is a custom SchemeDoc specific grammar rule.
196--
197--  <doc comment> → ;;> <all subsequent characters up to a line ending>
198--
199docComment :: Parser Sexp
200docComment =
201    -- Special case for section comments which must be followed by a
202    -- normal documentation comment and are thus one-line comments.
203    --
204    -- TODO: Maybe useful to have separate constructors for these.
205    (DocComment <$> (try (P.string ";;>|") >> (T.cons '|' <$> docChars)))
206        <|> (DocComment . T.concat <$> many1 (try (intraSpaces >> P.string ";;>") >> docChars))
207  where
208    docChars :: Parser T.Text
209    docChars = T.pack <$> manyTill' anyChar endOfLine
210
211-- Sign subsequent for peculiar identifier.
212--
213--  <sign subsequent> → <initial> | <explicit sign> | @
214--
215signSubsequent :: Parser String
216signSubsequent = do
217    first <- initial
218    sign <- explicitSign
219    ch <- char '@'
220    return [first, sign, ch]
221
222-- Dot subsequent for peculiar identifier.
223--
224--   | . <dot subsequent> <subsequent>*
225--
226dotSubsequent :: Parser String
227dotSubsequent = do
228    first <- char '.'
229    subsq <- many subsequent
230    return (first : subsq)
231
232-- Peculiar identifier.
233--
234--  <peculiar identifier> → <explicit sign>
235--      | <explicit sign> <sign subsequent> <subsequent>*
236--      | <explicit sign> . <dot subsequent> <subsequent>*
237--      | . <dot subsequent> <subsequent>*
238--
239peculiarIdentifier :: Parser String
240peculiarIdentifier =
241    fmap (: []) (try explicitSign)
242        <|> try
243            ( do
244                sign <- explicitSign
245                ssub <- signSubsequent
246                sub <- many subsequent
247                return ([sign] ++ ssub ++ sub)
248            )
249        <|> try
250            ( do
251                sign <- explicitSign
252                dsub <- dotSubsequent
253                sub <- many subsequent
254                return ([sign] ++ dsub ++ sub)
255            )
256        <|> try
257            ( do
258                dot <- char '.'
259                dsub <- dotSubsequent
260                sub <- many subsequent
261                return ([dot] ++ dsub ++ sub)
262            )
263
264-- Scheme identifier or symbol.
265--
266--  <identifier> → <initial> <subsequent>*
267--      | <vertical line> <symbol element>* <vertical line>
268--      | <peculiar identifier>
269--
270identifier :: Parser Sexp
271identifier =
272    fmap (Id . T.pack) $
273        (initial >>= (\i -> (:) i <$> many subsequent))
274            <|> between (char '|') (char '|') (many symbolElement)
275            <|> peculiarIdentifier
276
277-- Scheme boolean
278--
279--  <boolean> → #t | #f | #true | #false
280--
281boolean :: Parser Sexp
282boolean =
283    char '#'
284        >> ( Boolean True <$ (try (P.string "true") <|> P.string "t")
285                <|> Boolean False <$ (try (P.string "false") <|> P.string "f")
286           )
287
288-- Scheme character.
289--
290--  <character> → #\ <any character>
291--      | #\ <character name>
292--      | #\x<hex scalar value>
293--
294character :: Parser Sexp
295character =
296    fmap Char $
297        try (P.string "#\\" >> characterName)
298            <|> try (P.string "#\\x" >> fmap chr hex)
299            <|> try (P.string "#\\" >> anyChar)
300
301-- A Scheme String.
302--
303--  <string> → " <string element>* "
304--
305string :: Parser Sexp
306string =
307    Str . T.pack
308        <$> between (char '"') (char '"') (filterJust <$> many stringElement)
309
310-- Parse a list, e.g. (1 2 3).
311list :: Parser Sexp
312list =
313    List
314        <$> between (lexeme $ char '(') (char ')') (many sexp)
315
316-- Parse syntatic sugar for vectors, e.g. `#(1 2 3)`.
317vector :: Parser Sexp
318vector =
319    (\lst -> List $ Id "vector" : lst)
320        <$> between (lexeme $ P.string "#(") (char ')') (many sexp)
321
322-- Parse syntatic sugar for bytevectors, e.g. `#u8(1 2 3)`.
323bytevector :: Parser Sexp
324bytevector =
325    (\lst -> List $ Id "bytevector" : lst)
326        <$> between (lexeme $ P.string "#u8(") (char ')') (many sexp)
327
328-- -- Parse syntatic sugor for quotations, e.g. `'foo`.
329quote :: Parser Sexp
330quote =
331    fmap (\datum -> List [Id "quote", datum]) $
332        lexeme (char '\'') >> sexp'
333
334-- Parse syntatic sugar for quasiquotations, e.g. ``foo`.
335quasiquotation :: Parser Sexp
336quasiquotation =
337    fmap (\e -> List [Id "quasiquote", e]) $
338        lexeme (try $ char '`') >> sexp'
339
340-- Parse syntatic sugor for unquote, e.g. `,foo`.
341unquote :: Parser Sexp
342unquote =
343    fmap (\e -> List [Id "unquote", e]) $
344        lexeme (char ',') >> sexp'
345
346-- Parse syntatic sugar for unquote-splicing, e.g. `,@foo`.
347unquoteSplicing :: Parser Sexp
348unquoteSplicing =
349    fmap (\e -> List [Id "unquote-splicing", e]) $
350        lexeme (P.string ",@") >> sexp'
351
352-- Parse an S-Expression without lexing or delimiter handling
353-- according to the tokens defined in the R⁷RS formal syntax:
354--
355--  <token> → <identifier> | <boolean> | <number>
356--      | <character> | <string>
357--      | ( | ) | #( | #u8( | ’ | ` | , | ,@ | .
358--
359sexp' :: Parser Sexp
360sexp' =
361    identifier
362        <|> character
363        <|> number
364        <|> string
365        <|> list
366        <|> docComment
367        -- Boolean, comments, and {bit,}vectors all start with
368        -- a `#` character and thus require backtracking.
369        <|> try boolean
370        <|> try vector
371        <|> bytevector
372        -- XXX: Quotation tokens are ignored for now
373        <|> quote
374        <|> quasiquotation
375        <|> try unquoteSplicing
376        <|> unquote
377
378-- TODO: Directive (#!fold-case, …)
379
380-- Parse an s-expression with lexing and delimiter checking.
381sexp :: Parser Sexp
382sexp = lexeme $ terminatedBy sexp' (lookAhead (delim <|> eof))
383  where
384    delim :: Parser ()
385    delim = (delimiter <|> skip (P.string "#|") <|> eof) <?> "delimiter"
386
387-- “Comments are treated exactly like whitespace.”
388-- Comments require backtracking for docComment parser.
389lexComment :: Parser ()
390lexComment = skipMany (try comment >> spaces)
391
392-- Strip whitespaces and comments between tokens.
393lexeme :: Parser a -> Parser a
394lexeme p = do
395    r <- p
396    _ <- spaces
397    _ <- lexComment
398    return r
399
400-- | Parse a Scheme program as defined in the [R7RS-small](https://small.r7rs.org/) standard.
401scheme :: Parser [Sexp]
402scheme = do
403    _ <- spaces
404    _ <- lexComment
405    manyTill sexp eof
406
407-- TODO: Don't dupilcate lexeme here
408
409------------------------------------------------------------------------
410
411-- | Utility function for parsing a Scheme source code file.
412-- Behaves like 'Text.Parsec.String.parseFromFile' but unconditionally
413-- uses the 'scheme' parser and throws an 'ErrParser' exception on error.
414parseFromFile :: FilePath -> IO [Sexp]
415parseFromFile fileName = do
416    r <- P.parseFromFile scheme fileName
417    case r of
418        Left err -> throwIO $ ErrParser err
419        Right s -> pure s