1{-# LANGUAGE OverloadedStrings #-}23-- | This module implements parser combinators for the R7RS Scheme4-- language standard. The module implements a parser for the formal5-- syntax defined in Section 7.1 of the aformentioned standard.6module SchemeDoc.Parser.R7RS (scheme, parseFromFile)7where89import SchemeDoc.Error10import SchemeDoc.Parser.Number11import SchemeDoc.Parser.Util12import SchemeDoc.Types1314import Control.Exception (throwIO)15import Data.Char16import qualified Data.Text as T1718import Text.Parsec.Char (endOfLine)19import Text.ParserCombinators.Parsec hiding (parseFromFile, space, spaces, string)20import qualified Text.ParserCombinators.Parsec as P2122-- Intraline whitespace.23--24-- <intraline whitespace> → <space or tab>25--26intraSpace :: Parser Char27intraSpace = char ' ' <|> char '\t'2829-- Zero or more intraline whitespaces.30intraSpaces :: Parser ()31intraSpaces = skipMany intraSpace3233-- Whitespace.34--35-- <whitespace> → <intraline whitespace> | <line ending>36--37-- where38--39-- <line ending> → <newline> | <return> <newline> | <return>40--41space :: Parser Char42space = intraSpace <|> endOfLine <|> char '\r'4344-- Zero or more whitespaces.45spaces :: Parser ()46spaces = skipMany space4748------------------------------------------------------------------------4950-- Inline hex escape sequence.51--52-- <inline hex escape> → \x<hex scalar value>;53--54inlineHexEsc :: Parser Char55inlineHexEsc = between (P.string "\\x") (char ';') (fmap chr hex)5657-- Explicit sign notation for numeric types.58--59-- <explicit sign> → + | -60--61explicitSign :: Parser Char62explicitSign = char '+' <|> char '-'6364-- Initial character for an identifier.65--66-- <initial> → <letter> | <special initial>67--68initial :: Parser Char69initial = letter <|> specialInitial7071-- Special initial characters for symbols / identifiers.72--73-- <special initial> → ! | $ | % | & | * | / | : | < | = | > | ? | ^ | _ | ~74--75-- TODO: R⁷RS allows for a Scheme implementation to support76-- additional "extended identifier characters" the set below77-- is just the minimal set of extended identifier characters.78specialInitial :: Parser Char79specialInitial = oneOf "!$%&*+-./:<=>?@^_~"8081-- A special subsequent symbol / identifier character.82--83-- <special subsequent> → <explicit sign> | . | @84--85specialSubsequent :: Parser Char86specialSubsequent = explicitSign <|> char '.' <|> char '@'8788-- A subsequent character for an identifier.89--90-- <subsequent> → <initial> | <digit> | <special subsequent>91--92subsequent :: Parser Char93subsequent = initial <|> digit <|> specialSubsequent9495-- Element of a Scheme symbol / identifier.96--97-- <symbol element> →98-- <any character other than <vertical line> or \>99-- | <inline hex escape> | <mnemonic escape> | \|100--101symbolElement :: Parser Char102symbolElement =103 noneOf "\\|"104 <|> try inlineHexEsc105 <|> mnemonicEsc106107-- Parse a Scheme mnemonic escape character.108--109-- <mnemonic escape> → \a | \b | \t | \n | \r110--111mnemonicEsc :: Parser Char112mnemonicEsc =113 char '\\'114 >> ( bind "a" '\a' -- alarm115 <|> bind "b" '\b' -- backspace116 <|> bind "t" '\t' -- character tabulation117 <|> bind "n" '\n' -- linefeed118 <|> bind "r" '\r' -- return119 )120121-- Parse a character in a string, including escape sequences.122-- Returns Nothing for escaped newlines.123--124-- <string element> → <any character other than " or \>125-- | <mnemonic escape> | \" | \\126-- | \<intraline whitespace>*<line ending>127-- <intraline whitespace>*128-- | <inline hex escape>129--130stringElement :: Parser (Maybe Char)131stringElement =132 Just <$> noneOf "\"\\"133 <|> Just <$> try mnemonicEsc134 <|> try (bind "\\\"" (Just '"'))135 <|> try (bind "\\\\" (Just '\\'))136 <|> (Nothing <$ try (char '\\' >> intraSpaces >> char '\n' >> intraSpaces))137 <|> Just <$> inlineHexEsc138139-- TODO: Add something along the lines of `choice-try` to clean this up.140-- See the existing `choice` and `try` combinators provided by Parsec.141142-- Scheme character name.143--144-- <character name> → alarm | backspace | delete145-- | escape | newline | null | return | space | tab146--147characterName :: Parser Char148characterName =149 bind "alarm" '\a'150 <|> bind "backspace" '\b'151 <|> bind "delete" '\DEL'152 <|> bind "escape" '\ESC'153 <|> bind "newline" '\n'154 <|> bind "null" '\0'155 <|> bind "return" '\r'156 <|> bind "space" ' '157 <|> bind "tab" '\t'158159-- Parse a Scheme delimiter.160--161-- <delimiter> → <whitespace> | <vertical line> | ( | ) | " | ;162--163-- Does not consume any input on failure.164delimiter :: Parser ()165delimiter = skip $ space <|> char '|' <|> char '(' <|> char ')' <|> char '"' <|> char ';'166167-- Scheme block comment.168--169-- <nested comment> → #| <comment text>170-- <comment cont>* |#171--172-- XXX: Not sure if I understood the nested comment grammar rule correctly.173nestedComment :: Parser String174nestedComment = P.string "#|" >> manyTill anyChar (P.string "|#")175176-- Source code comment.177--178-- <comment> → ; <all subsequent characters up to a line ending>179-- | <nested comment>180-- | #; <intertoken space> <datum>181--182-- This parser intentionally ignores documentation comments.183-- See the docComment parser below.184comment :: Parser ()185comment =186 skip $187 (char ';' >> notFollowedBy (P.string ";>") >> manyTill anyChar endOfLine)188 <|> nestedComment189190-- TODO: #; comments191192------------------------------------------------------------------------193194-- A source code comment for documentation purposes.195-- This is a custom SchemeDoc specific grammar rule.196--197-- <doc comment> → ;;> <all subsequent characters up to a line ending>198--199docComment :: Parser Sexp200docComment =201 -- Special case for section comments which must be followed by a202 -- normal documentation comment and are thus one-line comments.203 --204 -- TODO: Maybe useful to have separate constructors for these.205 (DocComment <$> (try (P.string ";;>|") >> (T.cons '|' <$> docChars)))206 <|> (DocComment . T.concat <$> many1 (try (intraSpaces >> P.string ";;>") >> docChars))207 where208 docChars :: Parser T.Text209 docChars = T.pack <$> manyTill' anyChar endOfLine210211-- Sign subsequent for peculiar identifier.212--213-- <sign subsequent> → <initial> | <explicit sign> | @214--215signSubsequent :: Parser String216signSubsequent = do217 first <- initial218 sign <- explicitSign219 ch <- char '@'220 return [first, sign, ch]221222-- Dot subsequent for peculiar identifier.223--224-- | . <dot subsequent> <subsequent>*225--226dotSubsequent :: Parser String227dotSubsequent = do228 first <- char '.'229 subsq <- many subsequent230 return (first : subsq)231232-- Peculiar identifier.233--234-- <peculiar identifier> → <explicit sign>235-- | <explicit sign> <sign subsequent> <subsequent>*236-- | <explicit sign> . <dot subsequent> <subsequent>*237-- | . <dot subsequent> <subsequent>*238--239peculiarIdentifier :: Parser String240peculiarIdentifier =241 fmap (: []) (try explicitSign)242 <|> try243 ( do244 sign <- explicitSign245 ssub <- signSubsequent246 sub <- many subsequent247 return ([sign] ++ ssub ++ sub)248 )249 <|> try250 ( do251 sign <- explicitSign252 dsub <- dotSubsequent253 sub <- many subsequent254 return ([sign] ++ dsub ++ sub)255 )256 <|> try257 ( do258 dot <- char '.'259 dsub <- dotSubsequent260 sub <- many subsequent261 return ([dot] ++ dsub ++ sub)262 )263264-- Scheme identifier or symbol.265--266-- <identifier> → <initial> <subsequent>*267-- | <vertical line> <symbol element>* <vertical line>268-- | <peculiar identifier>269--270identifier :: Parser Sexp271identifier =272 fmap (Id . T.pack) $273 (initial >>= (\i -> (:) i <$> many subsequent))274 <|> between (char '|') (char '|') (many symbolElement)275 <|> peculiarIdentifier276277-- Scheme boolean278--279-- <boolean> → #t | #f | #true | #false280--281boolean :: Parser Sexp282boolean =283 char '#'284 >> ( Boolean True <$ (try (P.string "true") <|> P.string "t")285 <|> Boolean False <$ (try (P.string "false") <|> P.string "f")286 )287288-- Scheme character.289--290-- <character> → #\ <any character>291-- | #\ <character name>292-- | #\x<hex scalar value>293--294character :: Parser Sexp295character =296 fmap Char $297 try (P.string "#\\" >> characterName)298 <|> try (P.string "#\\x" >> fmap chr hex)299 <|> try (P.string "#\\" >> anyChar)300301-- A Scheme String.302--303-- <string> → " <string element>* "304--305string :: Parser Sexp306string =307 Str . T.pack308 <$> between (char '"') (char '"') (filterJust <$> many stringElement)309310-- Parse a list, e.g. (1 2 3).311list :: Parser Sexp312list =313 List314 <$> between (lexeme $ char '(') (char ')') (many sexp)315316-- Parse syntatic sugar for vectors, e.g. `#(1 2 3)`.317vector :: Parser Sexp318vector =319 (\lst -> List $ Id "vector" : lst)320 <$> between (lexeme $ P.string "#(") (char ')') (many sexp)321322-- Parse syntatic sugar for bytevectors, e.g. `#u8(1 2 3)`.323bytevector :: Parser Sexp324bytevector =325 (\lst -> List $ Id "bytevector" : lst)326 <$> between (lexeme $ P.string "#u8(") (char ')') (many sexp)327328-- -- Parse syntatic sugor for quotations, e.g. `'foo`.329quote :: Parser Sexp330quote =331 fmap (\datum -> List [Id "quote", datum]) $332 lexeme (char '\'') >> sexp'333334-- Parse syntatic sugar for quasiquotations, e.g. ``foo`.335quasiquotation :: Parser Sexp336quasiquotation =337 fmap (\e -> List [Id "quasiquote", e]) $338 lexeme (try $ char '`') >> sexp'339340-- Parse syntatic sugor for unquote, e.g. `,foo`.341unquote :: Parser Sexp342unquote =343 fmap (\e -> List [Id "unquote", e]) $344 lexeme (char ',') >> sexp'345346-- Parse syntatic sugar for unquote-splicing, e.g. `,@foo`.347unquoteSplicing :: Parser Sexp348unquoteSplicing =349 fmap (\e -> List [Id "unquote-splicing", e]) $350 lexeme (P.string ",@") >> sexp'351352-- Parse an S-Expression without lexing or delimiter handling353-- according to the tokens defined in the R⁷RS formal syntax:354--355-- <token> → <identifier> | <boolean> | <number>356-- | <character> | <string>357-- | ( | ) | #( | #u8( | ’ | ` | , | ,@ | .358--359sexp' :: Parser Sexp360sexp' =361 identifier362 <|> character363 <|> number364 <|> string365 <|> list366 <|> docComment367 -- Boolean, comments, and {bit,}vectors all start with368 -- a `#` character and thus require backtracking.369 <|> try boolean370 <|> try vector371 <|> bytevector372 -- XXX: Quotation tokens are ignored for now373 <|> quote374 <|> quasiquotation375 <|> try unquoteSplicing376 <|> unquote377378-- TODO: Directive (#!fold-case, …)379380-- Parse an s-expression with lexing and delimiter checking.381sexp :: Parser Sexp382sexp = lexeme $ terminatedBy sexp' (lookAhead (delim <|> eof))383 where384 delim :: Parser ()385 delim = (delimiter <|> skip (P.string "#|") <|> eof) <?> "delimiter"386387-- “Comments are treated exactly like whitespace.”388-- Comments require backtracking for docComment parser.389lexComment :: Parser ()390lexComment = skipMany (try comment >> spaces)391392-- Strip whitespaces and comments between tokens.393lexeme :: Parser a -> Parser a394lexeme p = do395 r <- p396 _ <- spaces397 _ <- lexComment398 return r399400-- | Parse a Scheme program as defined in the [R7RS-small](https://small.r7rs.org/) standard.401scheme :: Parser [Sexp]402scheme = do403 _ <- spaces404 _ <- lexComment405 manyTill sexp eof406407-- TODO: Don't dupilcate lexeme here408409------------------------------------------------------------------------410411-- | Utility function for parsing a Scheme source code file.412-- Behaves like 'Text.Parsec.String.parseFromFile' but unconditionally413-- uses the 'scheme' parser and throws an 'ErrParser' exception on error.414parseFromFile :: FilePath -> IO [Sexp]415parseFromFile fileName = do416 r <- P.parseFromFile scheme fileName417 case r of418 Left err -> throwIO $ ErrParser err419 Right s -> pure s