1/*2 * Copyright © 2016-2019 Sören Tempel3 *4 * This program is free software: you can redistribute it and/or5 * modify it under the terms of the GNU Affero General Public6 * License as published by the Free Software Foundation, either7 * version 3 of the License, or (at your option) any later version.8 *9 * This program is distributed in the hope that it will be useful,10 * but WITHOUT ANY WARRANTY; without even the implied warranty of11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU12 * Affero General Public License for more details.13 *14 * You should have received a copy of the GNU Affero General Public15 * License along with this program. If not, see16 * <http://www.gnu.org/licenses/>.17 */1819#include <assert.h>20#include <ctype.h>21#include <errno.h>22#include <limits.h>23#include <pthread.h>24#include <stdio.h>25#include <stdlib.h>26#include <string.h>2728#include <sys/types.h>2930#include "queue.h"31#include "scanner.h"32#include "token.h"33#include "turing.h"34#include "util.h"3536static void lexspace(scanner *);37static void lexcomment(scanner *);38static void lexstate(scanner *);39static void lexterm(scanner *);4041/**42 * Reads the next character from the input string and increments current43 * position of the scanner in the input string.44 *45 * @param scr Scanner from which character should be read.46 * @returns The next character or -1 if there is none.47 */48static int49nextch(scanner *scr)50{51 if (scr->pos >= scr->inlen)52 return -1;5354 return scr->input[scr->pos++];55}5657/**58 * Reads the next character from the input string but doesn't increment59 * the current position of the scanner.60 *61 * @param scr Scanner from which character should be read.62 * @returns The next character or -1 if there is none.63 */64static int65peekch(scanner *scr)66{67 int nxt;6869 nxt = nextch(scr);70 if (nxt != -1)71 scr->pos--;72 return nxt;73}7475/**76 * Ignores all characters in the input string after the current start position.77 *78 * @param scr Scanner for which characters should be ignored.79 */80static void81ignore(scanner *scr)82{83 scr->start = scr->pos;84}8586/**87 * Whether or not the current character is a valid turing machine input88 * symbol. Which is the case if it is either an alphanumeric character89 * or the special blank character.90 *91 * @param c Character which should be examined.92 * @returns Non-zero integer if it is, zero if it isn't.93 */94static int95issymbol(int c)96{97 return isalnum(c) || c == BLANKCHAR;98}99100/**101 * Emits a new token and enqueues it.102 *103 * @param scr Scanner which found the token that should be emitted.104 * @param tkt Type of the token.105 * @param value Value of the token.106 */107static void108emit(scanner *scr, toktype tkt, int value)109{110 token *tok;111112 tok = emalloc(sizeof(token));113 tok->type = tkt;114 tok->line = scr->line;115 tok->column = scr->column;116 tok->value = value;117118 enqueue(scr->tqueue, tok);119 scr->start = scr->pos;120}121122/**123 * Tail recursive function parsing the entire input string.124 * Usually passed to pthread_create(3).125 *126 * @param scr Pointer to the associated scanner.127 */128static void129lexany(scanner *scr)130{131 int nxt;132133 /* Make this function a cancel point. */134 pthread_testcancel();135136 scr->column++;137 if ((nxt = nextch(scr)) == -1) {138 scr->column = 0;139 if (scr->line > 1)140 scr->line--;141142 emit(scr, TOK_EOF, TOKNOP);143 LEXRET(scr, NULL);144 }145146 switch (nxt) {147 case '\n':148 scr->column = 0;149 scr->line++;150 ignore(scr);151 LEXRET(scr, lexany);152 case '#':153 LEXRET(scr, lexcomment);154 case ',':155 emit(scr, TOK_COMMA, TOKAUTO(scr));156 LEXRET(scr, lexany);157 case ';':158 emit(scr, TOK_SEMICOLON, TOKAUTO(scr));159 LEXRET(scr, lexany);160 case '{':161 emit(scr, TOK_LBRACKET, TOKAUTO(scr));162 LEXRET(scr, lexany);163 case '}':164 emit(scr, TOK_RBRACKET, TOKAUTO(scr));165 LEXRET(scr, lexany);166 case '<':167 emit(scr, TOK_SMALLER, TOKAUTO(scr));168 LEXRET(scr, lexany);169 case '>':170 emit(scr, TOK_GREATER, TOKAUTO(scr));171 LEXRET(scr, lexany);172 case '|':173 emit(scr, TOK_PIPE, TOKAUTO(scr));174 LEXRET(scr, lexany);175 case 'q':176 if (isdigit(peekch(scr)))177 LEXRET(scr, lexstate);178 break;179 case '=':180 if (nextch(scr) == '>')181 emit(scr, TOK_NEXT, TOKNOP);182 else183 emit(scr, TOK_ERROR, ERR_UNKOWN);184185 scr->column++;186 LEXRET(scr, lexany);187 }188189 if (issymbol(nxt) && !issymbol(peekch(scr))) {190 emit(scr, TOK_SYMBOL, TOKAUTO(scr));191 LEXRET(scr, lexany);192 } else if (isspace(nxt)) {193 LEXRET(scr, lexspace);194 }195196 LEXRET(scr, lexterm);197}198199/**200 * Skips/Ignores all white-space characters. Also201 * calls lexany as soon as it encounters the first202 * non-white-space character.203 *204 * @param scr Pointer to the associated scanner.205 */206static void207lexspace(scanner *scr)208{209 while (isspace(peekch(scr))) {210 nextch(scr);211 scr->column++;212 }213214 ignore(scr);215 LEXRET(scr, lexany);216}217218/**219 * Skips/Ignores a comment. A comment begins with the220 * ASCII symbol '#' and ends with a newline.221 *222 * @param scr Pointer to the associated scanner.223 */224static void225lexcomment(scanner *scr)226{227 while (peekch(scr) != '\n')228 if (nextch(scr) == -1)229 break;230231 /* We don't need to increment scr->column here because232 * comments are currently not exposed to the parser233 * and since they end with a newline they don't effect234 * other tokens either (unlike white-spaces). */235236 ignore(scr);237 LEXRET(scr, lexany);238}239240/**241 * Lexes a state name. Also calls lexany as soon as242 * it encounters a non-digit character. Besides an243 * error token is emitted if converting the token244 * name to an int would cause an integer over/underflow.245 *246 * @pre The previous char must be the ASCII character 'q'.247 * @param scr Pointer to the associated scanner.248 */249static void250lexstate(scanner *scr)251{252 char *input, buf[STATELEN];253 unsigned int col;254 size_t len;255 long value;256257 input = &scr->input[scr->pos];258259 col = scr->column;260 while (isdigit(peekch(scr)))261 nextch(scr);262 scr->column = col;263264 len = scr->pos - scr->start - 1;265 if (len > STATELEN - 1) {266 emit(scr, TOK_ERROR, ERR_OVERFLOW);267 goto ret;268 }269270 memcpy(buf, input, len);271 buf[len] = '\0';272273 value = strtol(buf, NULL, 10);274 if (value < 0)275 emit(scr, TOK_ERROR, ERR_UNDERFLOW);276 else if (value > INT_MAX)277 emit(scr, TOK_ERROR, ERR_OVERFLOW);278 else279 emit(scr, TOK_STATE, (int)value);280281ret:282 scr->column += (unsigned int)len; /* Initial 'q' and digits. */283 LEXRET(scr, lexany);284}285286/**287 * Attempts to lex the terminal symbols `start:` and `accept:`. It calls288 * ::lexany as soon as it reaches the end of the given terminal symbol289 * or if a character is encountered which is not part of the any known290 * terminal symbol.291 *292 * @param scr Pointer to the associated scanner.293 */294static void295lexterm(scanner *scr)296{297 char *ter;298 toktype tkt;299 size_t pos, len;300301 switch (scr->input[scr->start]) {302 case 's':303 tkt = TOK_START;304 ter = "start:";305 break;306 case 'a':307 tkt = TOK_ACCEPT;308 ter = "accept:";309 break;310 default:311 emit(scr, TOK_ERROR, ERR_UNKOWN);312 LEXRET(scr, lexany);313 }314315 len = strlen(ter);316 if (!xstrncmp(ter, &scr->input[scr->start], len, &pos)) {317 emit(scr, tkt, TOKNOP);318 } else {319 /* pos should always be < strlen({start:,accept:}) */320 assert(pos <= UINT_MAX - 1);321322 scr->column += (unsigned int)pos;323 emit(scr, TOK_ERROR, ERR_UNEXPECTED);324 }325326 scr->pos += --len;327 scr->start = scr->pos;328329 scr->column += (unsigned int)len;330 LEXRET(scr, lexany);331}332333/**334 * Function invoked by a seperate LWP for parsing the input string. This335 * function simply calls the current ::statefn until the called336 * ::statefn function sets the current state to `NULL`.337 *338 * @param pscr Void pointer to the scanner used for lexing.339 * @returns A null pointer.340 */341static void *342tokloop(void *pscr)343{344 scanner *scr;345346 scr = (scanner *)pscr;347 while (scr->state != NULL)348 (*scr->state)(scr); /* fn must set scr->state. */349350 return NULL;351}352353/**354 * Creates a new scanner for the given input and starts lexing the355 * it in a seperated thread.356 *357 * @param input Input which should be scanned.358 * @param len Length of the input.359 * @returns Scanner for the given input.360 */361scanner *362scanstr(char *input, size_t len)363{364 scanner *scr;365366 scr = emalloc(sizeof(scanner));367 scr->tqueue = newqueue();368 scr->state = lexany;369 scr->pos = scr->start = scr->column = 0;370 scr->inlen = len;371 scr->input = input;372 scr->line = 1;373374 if ((errno = pthread_create(&scr->thread, NULL, tokloop, (void *)scr)))375 die("pthread_create failed");376377 return scr;378}379380/**381 * Frees the allocated memory for the given scanner.382 *383 * @param scr Scanner for which allocated memory should be freed.384 */385void386freescanner(scanner *scr)387{388 assert(scr);389390 if (!pthread_cancel(scr->thread)) {391 if ((errno = pthread_join(scr->thread, NULL)))392 die("pthread_join failed");393 }394395 freequeue(scr->tqueue);396 free(scr);397}398399/**400 * Returns the least recent token scanned by the given scanner. If no401 * token has been scanned so for this function blocks until a token has402 * been emitted by the scanner. If the last token (TOK_EOF) was already403 * returned by this function then it causes a deadlock.404 *405 * @pre A previous call of this function didn't return TOK_EOF.406 * @param scr Scanner to extract token from.407 */408token *409nexttoken(scanner *scr)410{411 return dequeue(scr->tqueue);412}