1{-# LANGUAGE OverloadedStrings #-}
2
3-- | This module implements parser combinators for the R7RS Scheme
4-- language standard. The module implements a parser for the formal
5-- syntax defined in Section 7.1 of the aformentioned standard.
6module SchemeDoc.Parser.R7RS (scheme, parseFromFile)
7where
8
9import SchemeDoc.Error
10import SchemeDoc.Parser.Number
11import SchemeDoc.Parser.Util
12import SchemeDoc.Types
13
14import Control.Exception (throwIO)
15import Data.Char
16import qualified Data.Text as T
17
18import Text.Parsec.Char (endOfLine)
19import Text.ParserCombinators.Parsec hiding (parseFromFile, space, spaces, string)
20import qualified Text.ParserCombinators.Parsec as P
21
22-- Intraline whitespace.
23--
24-- <intraline whitespace> → <space or tab>
25--
26intraSpace :: Parser Char
27intraSpace = char ' ' <|> char '\t'
28
29-- Zero or more intraline whitespaces.
30intraSpaces :: Parser ()
31intraSpaces = skipMany intraSpace
32
33-- Whitespace.
34--
35-- <whitespace> → <intraline whitespace> | <line ending>
36--
37-- where
38--
39-- <line ending> → <newline> | <return> <newline> | <return>
40--
41space :: Parser Char
42space = intraSpace <|> endOfLine <|> char '\r'
43
44-- Zero or more whitespaces.
45spaces :: Parser ()
46spaces = skipMany space
47
48------------------------------------------------------------------------
49
50-- Inline hex escape sequence.
51--
52-- <inline hex escape> → \x<hex scalar value>;
53--
54inlineHexEsc :: Parser Char
55inlineHexEsc = between (P.string "\\x") (char ';') (fmap chr hex)
56
57-- Explicit sign notation for numeric types.
58--
59-- <explicit sign> → + | -
60--
61explicitSign :: Parser Char
62explicitSign = char '+' <|> char '-'
63
64-- Initial character for an identifier.
65--
66-- <initial> → <letter> | <special initial>
67--
68initial :: Parser Char
69initial = letter <|> specialInitial
70
71-- Special initial characters for symbols / identifiers.
72--
73-- <special initial> → ! | $ | % | & | * | / | : | < | = | > | ? | ^ | _ | ~
74--
75-- TODO: R⁷RS allows for a Scheme implementation to support
76-- additional "extended identifier characters" the set below
77-- is just the minimal set of extended identifier characters.
78specialInitial :: Parser Char
79specialInitial = oneOf "!$%&*+-./:<=>?@^_~"
80
81-- A special subsequent symbol / identifier character.
82--
83-- <special subsequent> → <explicit sign> | . | @
84--
85specialSubsequent :: Parser Char
86specialSubsequent = explicitSign <|> char '.' <|> char '@'
87
88-- A subsequent character for an identifier.
89--
90-- <subsequent> → <initial> | <digit> | <special subsequent>
91--
92subsequent :: Parser Char
93subsequent = initial <|> digit <|> specialSubsequent
94
95-- Element of a Scheme symbol / identifier.
96--
97-- <symbol element> →
98-- <any character other than <vertical line> or \>
99-- | <inline hex escape> | <mnemonic escape> | \|
100--
101symbolElement :: Parser Char
102symbolElement =
103 noneOf "\\|"
104 <|> try inlineHexEsc
105 <|> mnemonicEsc
106
107-- Parse a Scheme mnemonic escape character.
108--
109-- <mnemonic escape> → \a | \b | \t | \n | \r
110--
111mnemonicEsc :: Parser Char
112mnemonicEsc =
113 char '\\'
114 >> ( bind "a" '\a' -- alarm
115 <|> bind "b" '\b' -- backspace
116 <|> bind "t" '\t' -- character tabulation
117 <|> bind "n" '\n' -- linefeed
118 <|> bind "r" '\r' -- return
119 )
120
121-- Parse a character in a string, including escape sequences.
122-- Returns Nothing for escaped newlines.
123--
124-- <string element> → <any character other than " or \>
125-- | <mnemonic escape> | \" | \\
126-- | \<intraline whitespace>*<line ending>
127-- <intraline whitespace>*
128-- | <inline hex escape>
129--
130stringElement :: Parser (Maybe Char)
131stringElement =
132 Just <$> noneOf "\"\\"
133 <|> Just <$> try mnemonicEsc
134 <|> try (bind "\\\"" (Just '"'))
135 <|> try (bind "\\\\" (Just '\\'))
136 <|> (Nothing <$ try (char '\\' >> intraSpaces >> char '\n' >> intraSpaces))
137 <|> Just <$> inlineHexEsc
138
139-- TODO: Add something along the lines of `choice-try` to clean this up.
140-- See the existing `choice` and `try` combinators provided by Parsec.
141
142-- Scheme character name.
143--
144-- <character name> → alarm | backspace | delete
145-- | escape | newline | null | return | space | tab
146--
147characterName :: Parser Char
148characterName =
149 bind "alarm" '\a'
150 <|> bind "backspace" '\b'
151 <|> bind "delete" '\DEL'
152 <|> bind "escape" '\ESC'
153 <|> bind "newline" '\n'
154 <|> bind "null" '\0'
155 <|> bind "return" '\r'
156 <|> bind "space" ' '
157 <|> bind "tab" '\t'
158
159-- Parse a Scheme delimiter.
160--
161-- <delimiter> → <whitespace> | <vertical line> | ( | ) | " | ;
162--
163-- Does not consume any input on failure.
164delimiter :: Parser ()
165delimiter = skip $ space <|> char '|' <|> char '(' <|> char ')' <|> char '"' <|> char ';'
166
167-- Scheme block comment.
168--
169-- <nested comment> → #| <comment text>
170-- <comment cont>* |#
171--
172-- XXX: Not sure if I understood the nested comment grammar rule correctly.
173nestedComment :: Parser String
174nestedComment = P.string "#|" >> manyTill anyChar (P.string "|#")
175
176-- Source code comment.
177--
178-- <comment> → ; <all subsequent characters up to a line ending>
179-- | <nested comment>
180-- | #; <intertoken space> <datum>
181--
182-- This parser intentionally ignores documentation comments.
183-- See the docComment parser below.
184comment :: Parser ()
185comment =
186 skip $
187 (char ';' >> notFollowedBy (P.string ";>") >> manyTill anyChar endOfLine)
188 <|> nestedComment
189
190-- TODO: #; comments
191
192------------------------------------------------------------------------
193
194-- A source code comment for documentation purposes.
195-- This is a custom SchemeDoc specific grammar rule.
196--
197-- <doc comment> → ;;> <all subsequent characters up to a line ending>
198--
199docComment :: Parser Sexp
200docComment =
201 -- Special case for section comments which must be followed by a
202 -- normal documentation comment and are thus one-line comments.
203 --
204 -- TODO: Maybe useful to have separate constructors for these.
205 (DocComment <$> (try (P.string ";;>|") >> (T.cons '|' <$> docChars)))
206 <|> (DocComment . T.concat <$> many1 (try (intraSpaces >> P.string ";;>") >> docChars))
207 where
208 docChars :: Parser T.Text
209 docChars = T.pack <$> manyTill' anyChar endOfLine
210
211-- Sign subsequent for peculiar identifier.
212--
213-- <sign subsequent> → <initial> | <explicit sign> | @
214--
215signSubsequent :: Parser String
216signSubsequent = do
217 first <- initial
218 sign <- explicitSign
219 ch <- char '@'
220 return [first, sign, ch]
221
222-- Dot subsequent for peculiar identifier.
223--
224-- | . <dot subsequent> <subsequent>*
225--
226dotSubsequent :: Parser String
227dotSubsequent = do
228 first <- char '.'
229 subsq <- many subsequent
230 return (first : subsq)
231
232-- Peculiar identifier.
233--
234-- <peculiar identifier> → <explicit sign>
235-- | <explicit sign> <sign subsequent> <subsequent>*
236-- | <explicit sign> . <dot subsequent> <subsequent>*
237-- | . <dot subsequent> <subsequent>*
238--
239peculiarIdentifier :: Parser String
240peculiarIdentifier =
241 fmap (: []) (try explicitSign)
242 <|> try
243 ( do
244 sign <- explicitSign
245 ssub <- signSubsequent
246 sub <- many subsequent
247 return ([sign] ++ ssub ++ sub)
248 )
249 <|> try
250 ( do
251 sign <- explicitSign
252 dsub <- dotSubsequent
253 sub <- many subsequent
254 return ([sign] ++ dsub ++ sub)
255 )
256 <|> try
257 ( do
258 dot <- char '.'
259 dsub <- dotSubsequent
260 sub <- many subsequent
261 return ([dot] ++ dsub ++ sub)
262 )
263
264-- Scheme identifier or symbol.
265--
266-- <identifier> → <initial> <subsequent>*
267-- | <vertical line> <symbol element>* <vertical line>
268-- | <peculiar identifier>
269--
270identifier :: Parser Sexp
271identifier =
272 fmap (Id . T.pack) $
273 (initial >>= (\i -> (:) i <$> many subsequent))
274 <|> between (char '|') (char '|') (many symbolElement)
275 <|> peculiarIdentifier
276
277-- Scheme boolean
278--
279-- <boolean> → #t | #f | #true | #false
280--
281boolean :: Parser Sexp
282boolean =
283 char '#'
284 >> ( Boolean True <$ (try (P.string "true") <|> P.string "t")
285 <|> Boolean False <$ (try (P.string "false") <|> P.string "f")
286 )
287
288-- Scheme character.
289--
290-- <character> → #\ <any character>
291-- | #\ <character name>
292-- | #\x<hex scalar value>
293--
294character :: Parser Sexp
295character =
296 fmap Char $
297 try (P.string "#\\" >> characterName)
298 <|> try (P.string "#\\x" >> fmap chr hex)
299 <|> try (P.string "#\\" >> anyChar)
300
301-- A Scheme String.
302--
303-- <string> → " <string element>* "
304--
305string :: Parser Sexp
306string =
307 Str . T.pack
308 <$> between (char '"') (char '"') (filterJust <$> many stringElement)
309
310-- Parse a list, e.g. (1 2 3).
311list :: Parser Sexp
312list =
313 List
314 <$> between (lexeme $ char '(') (char ')') (many sexp)
315
316-- Parse syntatic sugar for vectors, e.g. `#(1 2 3)`.
317vector :: Parser Sexp
318vector =
319 (\lst -> List $ Id "vector" : lst)
320 <$> between (lexeme $ P.string "#(") (char ')') (many sexp)
321
322-- Parse syntatic sugar for bytevectors, e.g. `#u8(1 2 3)`.
323bytevector :: Parser Sexp
324bytevector =
325 (\lst -> List $ Id "bytevector" : lst)
326 <$> between (lexeme $ P.string "#u8(") (char ')') (many sexp)
327
328-- -- Parse syntatic sugor for quotations, e.g. `'foo`.
329quote :: Parser Sexp
330quote =
331 fmap (\datum -> List [Id "quote", datum]) $
332 lexeme (char '\'') >> sexp'
333
334-- Parse syntatic sugar for quasiquotations, e.g. ``foo`.
335quasiquotation :: Parser Sexp
336quasiquotation =
337 fmap (\e -> List [Id "quasiquote", e]) $
338 lexeme (try $ char '`') >> sexp'
339
340-- Parse syntatic sugor for unquote, e.g. `,foo`.
341unquote :: Parser Sexp
342unquote =
343 fmap (\e -> List [Id "unquote", e]) $
344 lexeme (char ',') >> sexp'
345
346-- Parse syntatic sugar for unquote-splicing, e.g. `,@foo`.
347unquoteSplicing :: Parser Sexp
348unquoteSplicing =
349 fmap (\e -> List [Id "unquote-splicing", e]) $
350 lexeme (P.string ",@") >> sexp'
351
352-- Parse an S-Expression without lexing or delimiter handling
353-- according to the tokens defined in the R⁷RS formal syntax:
354--
355-- <token> → <identifier> | <boolean> | <number>
356-- | <character> | <string>
357-- | ( | ) | #( | #u8( | ’ | ` | , | ,@ | .
358--
359sexp' :: Parser Sexp
360sexp' =
361 identifier
362 <|> character
363 <|> number
364 <|> string
365 <|> list
366 <|> docComment
367 -- Boolean, comments, and {bit,}vectors all start with
368 -- a `#` character and thus require backtracking.
369 <|> try boolean
370 <|> try vector
371 <|> bytevector
372 -- XXX: Quotation tokens are ignored for now
373 <|> quote
374 <|> quasiquotation
375 <|> try unquoteSplicing
376 <|> unquote
377
378-- TODO: Directive (#!fold-case, …)
379
380-- Parse an s-expression with lexing and delimiter checking.
381sexp :: Parser Sexp
382sexp = lexeme $ terminatedBy sexp' (lookAhead (delim <|> eof))
383 where
384 delim :: Parser ()
385 delim = (delimiter <|> skip (P.string "#|") <|> eof) <?> "delimiter"
386
387-- “Comments are treated exactly like whitespace.”
388-- Comments require backtracking for docComment parser.
389lexComment :: Parser ()
390lexComment = skipMany (try comment >> spaces)
391
392-- Strip whitespaces and comments between tokens.
393lexeme :: Parser a -> Parser a
394lexeme p = do
395 r <- p
396 _ <- spaces
397 _ <- lexComment
398 return r
399
400-- | Parse a Scheme program as defined in the [R7RS-small](https://small.r7rs.org/) standard.
401scheme :: Parser [Sexp]
402scheme = do
403 _ <- spaces
404 _ <- lexComment
405 manyTill sexp eof
406
407-- TODO: Don't dupilcate lexeme here
408
409------------------------------------------------------------------------
410
411-- | Utility function for parsing a Scheme source code file.
412-- Behaves like 'Text.Parsec.String.parseFromFile' but unconditionally
413-- uses the 'scheme' parser and throws an 'ErrParser' exception on error.
414parseFromFile :: FilePath -> IO [Sexp]
415parseFromFile fileName = do
416 r <- P.parseFromFile scheme fileName
417 case r of
418 Left err -> throwIO $ ErrParser err
419 Right s -> pure s